DENG-1025: gfx_telemetry r/w to GCS instead of S3

Eduardo Filho 2023-08-03 17:11:52 -04:00
Parent dc0be49a84
Commit 78218d3274
3 changed files with 46 additions and 39 deletions

View file

@@ -2,7 +2,6 @@
# pip install:
# python_moztelemetry
# git+https://github.com/FirefoxGraphics/telemetry.git#egg=pkg&subdirectory=analyses/bigquery_shim
# boto3==1.16.20
# six==1.15.0
import argparse
@@ -12,12 +11,13 @@ import os
import sys
import time
import boto3
from bigquery_shim import dashboard, snake_case
from moztelemetry import get_pings_properties
from pyspark import SparkContext
from pyspark.sql import SparkSession
from google.cloud import storage
def fmt_date(d):
return d.strftime("%Y%m%d")
@@ -27,7 +27,7 @@ def repartition(pipeline):
return pipeline.repartition(MaxPartitions).cache()
s3_client = boto3.client("s3")
storage_client = storage.Client()
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,7 +45,9 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--default-time-window", type=int, default=14)
parser.add_argument("--release-fraction", type=float, default=0.003)
parser.add_argument("--output-bucket", default="telemetry-public-analysis-2")
parser.add_argument(
"--output-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
)
parser.add_argument("--output-prefix", default="gfx/telemetry-data/")
return parser.parse_args()
@@ -61,12 +63,12 @@ OUTPUT_PREFIX = args.output_prefix
DefaultTimeWindow = args.default_time_window
ReleaseFraction = args.release_fraction
existing_objects = [
obj["Key"]
for obj in s3_client.list_objects_v2(Bucket=OUTPUT_BUCKET, Prefix=OUTPUT_PREFIX)[
"Contents"
]
]
bucket = storage_client.get_bucket(OUTPUT_BUCKET)
# List all objects in the GCS bucket with the given prefix
blobs = bucket.list_blobs(prefix=OUTPUT_PREFIX)
# Extract object names from the Blob objects
existing_objects = [blob.name for blob in blobs]
print(f"Existing objects: {existing_objects}")
# List of keys for properties on session pings that we care about.
@@ -324,10 +326,14 @@ def export(filename, obj, **kwargs):
full_filename = os.path.join(OUTPUT_PREFIX, f"{filename}.json")
print("Writing to {0}".format(full_filename))
# serialize snake case dicts via their underlying dict
s3_client.put_object(
Bucket=OUTPUT_BUCKET,
Key=full_filename,
Body=bytes(json.dumps(obj, cls=snake_case.SnakeCaseEncoder), encoding="utf-8"),
bucket = storage_client.get_bucket(OUTPUT_BUCKET)
blob = bucket.blob(full_filename)
# Convert the 'obj' dictionary to JSON and upload it to GCS
# serialize snake case dicts via their underlying dict
blob.upload_from_string(
json.dumps(obj, cls=snake_case.SnakeCaseEncoder),
content_type="application/json",
)
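
For reference, a minimal standalone sketch of the google-cloud-storage calls this file now uses in place of the boto3 ones; the bucket name, prefix, and payload below are placeholders for illustration, not values taken from the commit:

from google.cloud import storage

# Placeholder values, not the commit's real bucket/prefix.
BUCKET_NAME = "example-bucket"
PREFIX = "gfx/telemetry-data/"

storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)

# Counterpart of s3_client.list_objects_v2(Bucket=..., Prefix=...)["Contents"] / obj["Key"]:
existing_objects = [blob.name for blob in bucket.list_blobs(prefix=PREFIX)]

# Counterpart of s3_client.put_object(Bucket=..., Key=..., Body=...):
blob = bucket.blob(PREFIX + "example.json")
blob.upload_from_string('{"example": true}', content_type="application/json")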

View file

@@ -12,13 +12,14 @@ import os
import sys
import time
import boto3
import requests
from bigquery_shim import trends
from moztelemetry import get_one_ping_per_client
from pyspark import SparkContext
from pyspark.sql import SparkSession
from google.cloud import storage
def fmt_date(d):
return d.strftime("%Y%m%d")
@@ -32,7 +33,7 @@ def repartition(pipeline):
return pipeline.repartition(MaxPartitions).cache()
s3_client = boto3.client("s3")
storage_client = storage.Client()
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,13 +46,13 @@ WeeklyFraction = 0.003
# Amount of days Telemetry keeps.
MaxHistoryInDays = datetime.timedelta(days=210)
# Bucket we'll drop files into on S3. If this is None, we won't attempt any
# S3 uploads, and the analysis will start from scratch.
S3_BUCKET = "telemetry-public-analysis-2"
S3_PREFIX = "gfx/telemetry-data/"
# Bucket we'll drop files into on GCS. If this is None, we won't attempt any
# GCS uploads, and the analysis will start from scratch.
GCS_BUCKET = "moz-fx-data-static-websit-f7e0-analysis-output"
GCS_PREFIX = "gfx/telemetry-data/"
GITHUB_REPO = "https://raw.githubusercontent.com/FirefoxGraphics/moz-gfx-telemetry"
# List of jobs allowed to have a first-run (meaning no S3 content).
# List of jobs allowed to have a first-run (meaning no GCS content).
BrandNewJobs = []
# If true, backfill up to MaxHistoryInDays rather than the last update.
@@ -329,7 +330,7 @@ class TrendGroup(TrendBase):
# A Trend object takes a new set of pings for a week's worth of data,
# analyzes it, and adds the result to the trend set. Trend sets are
# cached in S3 as JSON.
# cached in GCS as JSON.
#
# If the latest entry in the cache covers less than a full week of
# data, the entry is removed so that week can be re-queried.
@@ -431,25 +432,22 @@ class Trend(TrendBase):
with open(self.local_path, "w") as fp:
fp.write(text)
if S3_BUCKET is not None:
if GCS_BUCKET is not None:
try:
s3_client.upload_file(
Filename=self.local_path,
Bucket=S3_BUCKET,
Key=os.path.join(S3_PREFIX, self.name),
)
bucket = storage_client.get_bucket(GCS_BUCKET)
blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
blob.upload_from_filename(self.local_path)
except Exception as e:
print("Failed s3 upload: {0}".format(e))
print("Failed gcs upload: {0}".format(e))
def fetch_json(self):
print("Reading file {0}".format(self.local_path))
if S3_BUCKET is not None:
if GCS_BUCKET is not None:
try:
s3_client.download_file(
Bucket=S3_BUCKET,
Key=os.path.join(S3_PREFIX, self.name),
Filename=self.local_path,
)
storage_client = storage.Client()
bucket = storage_client.get_bucket(GCS_BUCKET)
blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
blob.download_to_filename(self.local_path)
with open(self.local_path, "r") as fp:
return json.load(fp)
except Exception:
@@ -595,8 +593,10 @@ def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--force-max-backfill", action="store_true")
parser.add_argument("--weekly-fraction", type=float, default=0.003)
parser.add_argument("--s3-bucket", default="telemetry-public-analysis-2")
parser.add_argument("--s3-prefix", default="gfx/telemetry-data/")
parser.add_argument(
"--gcs-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
)
parser.add_argument("--gcs-prefix", default="gfx/telemetry-data/")
parser.add_argument("--max-history-in-days", type=int, default=210)
parser.add_argument("--brand-new-jobs", action="append", default=[])
return parser.parse_args()
@ -606,8 +606,8 @@ if __name__ == "__main__":
args = parse_args()
ForceMaxBackfill = args.force_max_backfill
WeeklyFraction = args.weekly_fraction
S3_BUCKET = args.s3_bucket
S3_PREFIX = args.s3_prefix
GCS_BUCKET = args.gcs_bucket
GCS_PREFIX = args.gcs_prefix
MaxHistoryInDays = datetime.timedelta(days=args.max_history_in_days)
BrandNewJobs = args.brand_new_jobs
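
As above, a minimal sketch (not part of the diff) of the file-based upload and download calls the trends job now relies on, with placeholder bucket, object, and local paths:

from google.cloud import storage

storage_client = storage.Client()
bucket = storage_client.get_bucket("example-bucket")  # placeholder bucket name
blob = bucket.blob("gfx/telemetry-data/example-trend.json")  # placeholder object name

# Counterpart of s3_client.upload_file(Filename=..., Bucket=..., Key=...):
blob.upload_from_filename("/tmp/example-trend.json")

# Counterpart of s3_client.download_file(Bucket=..., Key=..., Filename=...):
blob.download_to_filename("/tmp/example-trend.json")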

View file

@@ -33,6 +33,7 @@ setup(
'botocore==1.19.20',
'click==7.1.2',
'click_datetime==0.2',
'google-cloud-storage==2.7.0',
'numpy==1.19.4',
'pandas==1.1.4',
# NOTE: this is pinned to 3.20.3 because protos changed and we can't regenerate them.