From b832abd81d6ce5116879e6d01935bb311ddd381c Mon Sep 17 00:00:00 2001
From: Anthony Miyaguchi
Date: Thu, 21 Feb 2019 13:12:41 -0800
Subject: [PATCH] Optimize system_check to prevent large dataset scans

---
 mozetl/system_check.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mozetl/system_check.py b/mozetl/system_check.py
index 8bc1cd3..f70c9a1 100644
--- a/mozetl/system_check.py
+++ b/mozetl/system_check.py
@@ -55,10 +55,8 @@ def main(
         )
     )
 
-    main_summary = spark.read.parquet(input_path)
-    subset = main_summary.where(
-        "submission_date_s3 = '{}'".format(ds_nodash)
-    ).where("sample_id='{}'".format(1))
+    path = "{}/submission_date_s3={}/sample_id={}".format(input_path, ds_nodash, 1)
+    subset = spark.read.parquet(path)
     print("Saw {} documents".format(subset.count()))
 
     summary = subset.select(