Port bhr collection from databricks (#356)

2020-12-17 18:01:18 -05:00 · 2020-12-17 18:01:18 -05:00 · c183d901e8
--- a/bin/mozetl-submit-dataproc.sh
+++ b/bin/mozetl-submit-dataproc.sh
@ -19,4 +19,5 @@ fi
 gcloud dataproc jobs submit pyspark $2 \
    --cluster=$1 \
    --region=us-central1 \
-    --jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'
+    --jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar' \
+    -- "${@:3}"
--- a/mozetl/bhr_collection/README.md
+++ b/mozetl/bhr_collection/README.md
@ -0,0 +1,14 @@
+# bhr collection
+
+To run this job manually, first create the Dataproc cluster using the `create_cluster.sh` script.
+This script requires the `$AWS_ACCESS_KEY_ID` and `$AWS_SECRET_ACCESS_KEY` environment variables to be defined.
+
+The job can be submitted to the cluster using gcloud:
+
+```sh
+gcloud dataproc jobs submit pyspark bhr_collection.py \
+    --cluster=bhr-collection \
+    --region=us-central1 \
+    --jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar' \
+    -- --date=2020-12-31 --sample-size=0.01
+```
--- a/mozetl/bhr_collection/init.py
+++ b/mozetl/bhr_collection/init.py
--- a/mozetl/bhr_collection/bhr_collection.py
+++ b/mozetl/bhr_collection/bhr_collection.py
--- a/mozetl/bhr_collection/create_cluster.sh
+++ b/mozetl/bhr_collection/create_cluster.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Create the bhr-collection Dataproc cluster for manual testing.
+# Requires $AWS_ACCESS_KEY_ID and $AWS_SECRET_ACCESS_KEY; errors go to stderr.
+# NOTE(review): the keys are passed via --properties and so appear in argv.
+
+for required in AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; do
+    if [[ -z ${!required:-} ]]; then
+        printf '$%s not defined\n' "$required" >&2
+        exit 1
+    fi
+done
+
+gcloud dataproc clusters create bhr-collection \
+    --image-version=1.5 \
+    --region=us-central1 \
+    --metadata='PIP_PACKAGES=boto3==1.16.20 click==7.1.2' \
+    --num-workers=5 \
+    --worker-machine-type='n2-highmem-4' \
+    --properties "core:fs.s3.awsAccessKeyId=$AWS_ACCESS_KEY_ID,core:fs.s3.awsSecretAccessKey=$AWS_SECRET_ACCESS_KEY,spark-env:AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,spark-env:AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
+    --initialization-actions='gs://dataproc-initialization-actions/python/pip-install.sh'