Port bhr collection from databricks (#356)
This commit is contained in:
Родитель
43ffb961db
Коммит
c183d901e8
|
@ -19,4 +19,5 @@ fi
|
|||
gcloud dataproc jobs submit pyspark $2 \
|
||||
--cluster=$1 \
|
||||
--region=us-central1 \
|
||||
--jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'
|
||||
--jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar' \
|
||||
-- ${@:3:$#}
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
# bhr collection
|
||||
|
||||
To run this job manually, first create the dataproc cluster using the `create_cluster.sh` script.
|
||||
This script requires the environment variables `$AWS_ACCESS_KEY_ID` and `$AWS_SECRET_ACCESS_KEY` to be set.
|
||||
|
||||
The job can be submitted to the cluster using gcloud:
|
||||
|
||||
```sh
|
||||
gcloud dataproc jobs submit pyspark bhr_collection.py \
|
||||
--cluster=bhr-collection \
|
||||
--region=us-central1 \
|
||||
--jars 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar' \
|
||||
-- --date=2020-12-31 --sample-size=0.01
|
||||
```
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
# Create the Dataproc cluster used to run the bhr-collection job.
#
# Required environment variables:
#   AWS_ACCESS_KEY_ID      - AWS credential, passed to the cluster so Spark
#                            can read from S3 (core:fs.s3.* properties).
#   AWS_SECRET_ACCESS_KEY  - matching AWS secret.
#
# The pip-install.sh initialization action installs the packages listed in
# the PIP_PACKAGES metadata key on every cluster node.

set -euo pipefail

# Fail fast with a clear message if either credential is missing.
# ${VAR:-} keeps the checks safe under `set -u` when the var is unset.
if [[ -z "${AWS_ACCESS_KEY_ID:-}" ]]; then
  echo '$AWS_ACCESS_KEY_ID not defined' >&2
  exit 1
fi
if [[ -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
  echo '$AWS_SECRET_ACCESS_KEY not defined' >&2
  exit 1
fi

gcloud dataproc clusters create bhr-collection \
  --image-version=1.5 \
  --region=us-central1 \
  --metadata='PIP_PACKAGES=boto3==1.16.20 click==7.1.2' \
  --num-workers=5 \
  --worker-machine-type='n2-highmem-4' \
  --properties "core:fs.s3.awsAccessKeyId=$AWS_ACCESS_KEY_ID,core:fs.s3.awsSecretAccessKey=$AWS_SECRET_ACCESS_KEY,spark-env:AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,spark-env:AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
  --initialization-actions='gs://dataproc-initialization-actions/python/pip-install.sh'
|
Загрузка…
Ссылка в новой задаче