DENG-1025: gfx_telemetry r/w to GCS instead of S3
Parent: dc0be49a84
Commit: 78218d3274
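For reviewers, the whole change boils down to swapping boto3 S3 calls for their google-cloud-storage equivalents. Below is a minimal, hedged sketch of that mapping using the bucket and prefix from the diff; the example object name and local path are hypothetical and only illustrate the call shapes.

import json

from google.cloud import storage

GCS_BUCKET = "moz-fx-data-static-websit-f7e0-analysis-output"
GCS_PREFIX = "gfx/telemetry-data/"

storage_client = storage.Client()              # replaces boto3.client("s3")
bucket = storage_client.get_bucket(GCS_BUCKET)

# list_objects_v2(...)["Contents"] -> bucket.list_blobs(prefix=...)
existing_objects = [blob.name for blob in bucket.list_blobs(prefix=GCS_PREFIX)]

# put_object(Body=...) -> blob.upload_from_string(...)
blob = bucket.blob(GCS_PREFIX + "example.json")  # hypothetical object name
blob.upload_from_string(json.dumps({"ok": True}), content_type="application/json")

# upload_file / download_file -> upload_from_filename / download_to_filename
blob.upload_from_filename("/tmp/example.json")   # hypothetical local path
blob.download_to_filename("/tmp/example.json")
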
@@ -2,7 +2,6 @@
 # pip install:
 # python_moztelemetry
 # git+https://github.com/FirefoxGraphics/telemetry.git#egg=pkg&subdirectory=analyses/bigquery_shim
-# boto3==1.16.20
 # six==1.15.0

 import argparse
@@ -12,12 +11,13 @@ import os
 import sys
 import time

-import boto3
 from bigquery_shim import dashboard, snake_case
 from moztelemetry import get_pings_properties
 from pyspark import SparkContext
 from pyspark.sql import SparkSession

+from google.cloud import storage
+

 def fmt_date(d):
     return d.strftime("%Y%m%d")
@@ -27,7 +27,7 @@ def repartition(pipeline):
     return pipeline.repartition(MaxPartitions).cache()


-s3_client = boto3.client("s3")
+storage_client = storage.Client()

 sc = SparkContext.getOrCreate()
 spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,7 +45,9 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--default-time-window", type=int, default=14)
     parser.add_argument("--release-fraction", type=float, default=0.003)
-    parser.add_argument("--output-bucket", default="telemetry-public-analysis-2")
+    parser.add_argument(
+        "--output-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
+    )
     parser.add_argument("--output-prefix", default="gfx/telemetry-data/")
     return parser.parse_args()

@@ -61,12 +63,12 @@ OUTPUT_PREFIX = args.output_prefix
 DefaultTimeWindow = args.default_time_window
 ReleaseFraction = args.release_fraction

-existing_objects = [
-    obj["Key"]
-    for obj in s3_client.list_objects_v2(Bucket=OUTPUT_BUCKET, Prefix=OUTPUT_PREFIX)[
-        "Contents"
-    ]
-]
+bucket = storage_client.get_bucket(OUTPUT_BUCKET)
+# List all objects in the GCS bucket with the given prefix
+blobs = bucket.list_blobs(prefix=OUTPUT_PREFIX)
+# Extract object names from the Blob objects
+existing_objects = [blob.name for blob in blobs]

+print(f"Existing objects: {existing_objects}")

 # List of keys for properties on session pings that we care about.
@@ -324,10 +326,14 @@ def export(filename, obj, **kwargs):
     full_filename = os.path.join(OUTPUT_PREFIX, f"{filename}.json")
     print("Writing to {0}".format(full_filename))
-    # serialize snake case dicts via their underlying dict
-    s3_client.put_object(
-        Bucket=OUTPUT_BUCKET,
-        Key=full_filename,
-        Body=bytes(json.dumps(obj, cls=snake_case.SnakeCaseEncoder), encoding="utf-8"),
+    bucket = storage_client.get_bucket(OUTPUT_BUCKET)
+    blob = bucket.blob(full_filename)
+
+    # Convert the 'obj' dictionary to JSON and upload it to GCS
+    # serialize snake case dicts via their underlying dict
+    blob.upload_from_string(
+        json.dumps(obj, cls=snake_case.SnakeCaseEncoder),
+        content_type="application/json",
     )


@@ -12,13 +12,14 @@ import os
 import sys
 import time

-import boto3
 import requests
 from bigquery_shim import trends
 from moztelemetry import get_one_ping_per_client
 from pyspark import SparkContext
 from pyspark.sql import SparkSession

+from google.cloud import storage
+

 def fmt_date(d):
     return d.strftime("%Y%m%d")
@@ -32,7 +33,7 @@ def repartition(pipeline):
     return pipeline.repartition(MaxPartitions).cache()


-s3_client = boto3.client("s3")
+storage_client = storage.Client()

 sc = SparkContext.getOrCreate()
 spark = SparkSession.builder.appName("graphics-trends").getOrCreate()
@@ -45,13 +46,13 @@ WeeklyFraction = 0.003
 # Amount of days Telemetry keeps.
 MaxHistoryInDays = datetime.timedelta(days=210)

-# Bucket we'll drop files into on S3. If this is None, we won't attempt any
-# S3 uploads, and the analysis will start from scratch.
-S3_BUCKET = "telemetry-public-analysis-2"
-S3_PREFIX = "gfx/telemetry-data/"
+# Bucket we'll drop files into on GCS. If this is None, we won't attempt any
+# GCS uploads, and the analysis will start from scratch.
+GCS_BUCKET = "moz-fx-data-static-websit-f7e0-analysis-output"
+GCS_PREFIX = "gfx/telemetry-data/"
 GITHUB_REPO = "https://raw.githubusercontent.com/FirefoxGraphics/moz-gfx-telemetry"

-# List of jobs allowed to have a first-run (meaning no S3 content).
+# List of jobs allowed to have a first-run (meaning no GCS content).
 BrandNewJobs = []

 # If true, backfill up to MaxHistoryInDays rather than the last update.
@@ -329,7 +330,7 @@ class TrendGroup(TrendBase):

 # A Trend object takes a new set of pings for a week's worth of data,
 # analyzes it, and adds the result to the trend set. Trend sets are
-# cached in S3 as JSON.
+# cached in GCS as JSON.
 #
 # If the latest entry in the cache covers less than a full week of
 # data, the entry is removed so that week can be re-queried.
@@ -431,25 +432,22 @@ class Trend(TrendBase):
         with open(self.local_path, "w") as fp:
             fp.write(text)

-        if S3_BUCKET is not None:
+        if GCS_BUCKET is not None:
             try:
-                s3_client.upload_file(
-                    Filename=self.local_path,
-                    Bucket=S3_BUCKET,
-                    Key=os.path.join(S3_PREFIX, self.name),
-                )
+                bucket = storage_client.get_bucket(GCS_BUCKET)
+                blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
+                blob.upload_from_filename(self.local_path)
             except Exception as e:
-                print("Failed s3 upload: {0}".format(e))
+                print("Failed gcs upload: {0}".format(e))

     def fetch_json(self):
         print("Reading file {0}".format(self.local_path))
-        if S3_BUCKET is not None:
+        if GCS_BUCKET is not None:
             try:
-                s3_client.download_file(
-                    Bucket=S3_BUCKET,
-                    Key=os.path.join(S3_PREFIX, self.name),
-                    Filename=self.local_path,
-                )
+                storage_client = storage.Client()
+                bucket = storage_client.get_bucket(GCS_BUCKET)
+                blob = bucket.blob(os.path.join(GCS_PREFIX, self.name))
+                blob.download_to_filename(self.local_path)
                 with open(self.local_path, "r") as fp:
                     return json.load(fp)
             except Exception:
@@ -595,8 +593,10 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--force-max-backfill", action="store_true")
     parser.add_argument("--weekly-fraction", type=float, default=0.003)
-    parser.add_argument("--s3-bucket", default="telemetry-public-analysis-2")
-    parser.add_argument("--s3-prefix", default="gfx/telemetry-data/")
+    parser.add_argument(
+        "--gcs-bucket", default="moz-fx-data-static-websit-f7e0-analysis-output"
+    )
+    parser.add_argument("--gcs-prefix", default="gfx/telemetry-data/")
     parser.add_argument("--max-history-in-days", type=int, default=210)
     parser.add_argument("--brand-new-jobs", action="append", default=[])
     return parser.parse_args()
@ -606,8 +606,8 @@ if __name__ == "__main__":
|
|||
args = parse_args()
|
||||
ForceMaxBackfill = args.force_max_backfill
|
||||
WeeklyFraction = args.weekly_fraction
|
||||
S3_BUCKET = args.s3_bucket
|
||||
S3_PREFIX = args.s3_prefix
|
||||
GCS_BUCKET = args.gcs_bucket
|
||||
GCS_PREFIX = args.gcs_prefix
|
||||
MaxHistoryInDays = datetime.timedelta(days=args.max_history_in_days)
|
||||
BrandNewJobs = args.brand_new_jobs
|
||||
|
||||
|
|
setup.py
@@ -33,6 +33,7 @@ setup(
         'botocore==1.19.20',
         'click==7.1.2',
         'click_datetime==0.2',
+        'google-cloud-storage==2.7.0',
         'numpy==1.19.4',
         'pandas==1.1.4',
         # NOTE: this is pinned to 3.20.3 because protos changed and we can't regenerate them.