This commit is contained in:
Victor Ng 2018-02-01 17:06:45 -05:00
Родитель 4f79867117
Коммит 0b644246c0
3 изменённых файлов: 11 добавлений и 17 удалений

Просмотреть файл

@ -12,6 +12,9 @@ from datetime import timedelta
import taar_loader
from pyspark import SparkConf
from pyspark.sql import SparkSession
def parse_args():
def valid_date_type(arg_date_str):
@ -41,4 +44,8 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
taar_loader.runme(args.run_date)
APP_NAME = "HBaseAddonRecommenderView"
conf = SparkConf().setAppName(APP_NAME)
conf = conf.setMaster("local[*]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
taar_loader.main(spark, args.run_date)

Просмотреть файл

@ -1,2 +1,2 @@
from taar_dynamo import runme
from filters import dynamo_reducer
from .taar_dynamo import main
from .filters import dynamo_reducer

Просмотреть файл

@ -8,8 +8,6 @@ This module replicates the scala script over at
https://github.com/mozilla/telemetry-batch-view/blob/1c544f65ad2852703883fe31a9fba38c39e75698/src/main/scala/com/mozilla/telemetry/views/HBaseAddonRecommenderView.scala
"""
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, row_number
from pyspark.sql import Window
@ -23,10 +21,7 @@ def etl(spark, run_date, dataFrameFunc):
print("Processing %s" % currentDateString)
# Get the data for the desired date out of the dataframe
tmp = dataFrameFunc(currentDateString)
# TODO: Remove downsampling later
datasetForDate = tmp.sample(False, 0.00000001)
datasetForDate = dataFrameFunc(currentDateString)
print("Dataset is sampled!")
@ -124,11 +119,3 @@ def reducer(tuple_a, tuple_b):
tuple_data = tuple(tuple_data)
return tuple_data
def runme(run_date):
APP_NAME = "HBaseAddonRecommenderView"
conf = SparkConf().setAppName(APP_NAME)
conf = conf.setMaster("local[*]")
sparkSession = SparkSession.builder.config(conf=conf).getOrCreate()
return main(sparkSession, run_date)