Fix issues with python3 for spark, POSTGRES_HOST location
This commit is contained in:
Parent: 4dd3769abf
Commit: 4884bcb46f
@@ -1,3 +1,5 @@
 CLOUDSDK_CONFIG=~/.config/gcloud
-# See https://github.com/mozilla-services/data-sandbox-terraform/tree/master/projects/etl-graph
-CLOUDSDK_CORE_PROJECT=etl-graph
+# See https://github.com/mozilla-services/data-sandbox-terraform/tree/master/projects/mozaggregator2bq
+CLOUDSDK_CORE_PROJECT=mozaggregator2bq
+# Set to the read-only host of the mozaggregator database
+POSTGRES_HOST=
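POSTGRES_HOST is deliberately left empty: per the new comment, each user fills in the read-only host locally rather than committing it. A minimal sketch of loading this file into a shell session, assuming it is the conventional .env at the repository root:

    set -a        # auto-export every assignment that follows
    source .env   # picks up CLOUDSDK_* and POSTGRES_HOST
    set +a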
@@ -1,5 +1,8 @@
 FROM gcr.io/google.com/cloudsdktool/cloud-sdk

+RUN echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
+    curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
+
 RUN apt update && apt -y install jq postgresql

 WORKDIR /app
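A quick way to confirm that the PGDG repository wired in above is supplying the client tools, with an image tag assumed for illustration:

    docker build -t mozaggregator2bq .
    docker run --rm mozaggregator2bq psql --version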
bin/backfill (12 lines changed)
@@ -2,10 +2,9 @@
 set -e

-export PROJECT="mozaggregator2bq"
 export DATASET="aggregates"
 export DATA_DIR="data"
-START_DS=${START_DS?"START_DS must be set"}
-END_DS=${END_DS?"END_DS must be set"}
+START_DS=${START_DS?"must be set"}
+END_DS=${END_DS?"must be set"}

 function to_ds {
 DS_NODASH=$1 python3 - <<EOD
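The to_ds body is cut off by the hunk; going by the DS_NODASH name, it presumably turns a YYYYMMDD stamp into YYYY-MM-DD. A standalone sketch of that assumed conversion:

    DS_NODASH=20200101 python3 - <<EOD
    import os
    from datetime import datetime
    print(datetime.strptime(os.environ["DS_NODASH"], "%Y%m%d").strftime("%Y-%m-%d"))
    EOD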
@@ -66,7 +65,7 @@ EOD
 cd "$(dirname "$0")/.."

 # checking if spark is enabled
-python -c "import pyspark; print(pyspark.__path__[0])"
+python3 -c "import pyspark; print(pyspark.__path__[0])"

 # checking if credentials are set, check export_credentials_s3 for full list
 : "${POSTGRES_USER?'POSTGRES_USER not set'}"
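The ":" builtin expands its arguments and discards them, so the check above aborts only when POSTGRES_USER is unset. The same ${VAR?message} expansion is why the shortened messages in the earlier hunk lose nothing: the shell already prefixes the variable name when the check fails.

    unset START_DS
    : "${START_DS?must be set}"   # -> bash: START_DS: must be set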
@@ -78,11 +77,6 @@ function cleanup {
 trap cleanup EXIT
-gcloud config set project $PROJECT
-
-if ! bq ls $PROJECT:$DATASET; then
-  bq mk $PROJECT:$DATASET
-fi


 mkdir -p "$DATA_DIR"
 for ds_nodash in $(ds_nodash_range "$START_DS" "$END_DS"); do
   time run_day "submission_date" "$ds_nodash"
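With that block removed, the script now assumes both the active gcloud project (via CLOUDSDK_CORE_PROJECT) and the mozaggregator2bq:aggregates dataset already exist. Under that assumption the dataset would be provisioned once, out of band, along the lines of:

    bq mk mozaggregator2bq:aggregates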
@@ -3,9 +3,10 @@

 set -x

-SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") \
-PYSPARK_DRIVER_PYTHON=jupyter \
-PYSPARK_DRIVER_PYTHON_OPTS=notebook \
+export PYSPARK_PYTHON=python3
+export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
+export PYSPARK_DRIVER_PYTHON=jupyter
+export PYSPARK_DRIVER_PYTHON_OPTS=notebook
 pyspark \
 --master 'local[*]' \
 --conf spark.driver.memory=8g \
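The substantive addition is PYSPARK_PYTHON=python3: Spark starts its worker interpreters from that variable, and a python 2 worker paired with a python 3 driver fails with a version-mismatch error. One way to confirm the wiring from inside the launched notebook, assuming the usual pyspark-provided spark session (pythonExec is the SparkContext attribute that records the setting):

    print(spark.sparkContext.pythonExec)   # expected: python3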
@@ -2,7 +2,8 @@

 set -x

-SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") \
+export PYSPARK_PYTHON=python3
+export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
 spark-submit \
 --master 'local[*]' \
 --conf spark.driver.memory=8g \
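Under --master 'local[*]' the whole job runs inside the single driver JVM, so spark.driver.memory is effectively the job's entire memory budget. The same setting also exists as a dedicated flag; job.py below stands in for whatever entry point the wrapper receives:

    spark-submit --master 'local[*]' --driver-memory 8g job.py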
@@ -12,6 +12,7 @@ services:
     environment:
       - CLOUDSDK_CONFIG=/tmp/.config/gcloud
       - CLOUDSDK_CORE_PROJECT
+      - POSTGRES_HOST
       # ensure that the project can access the credentials bucket
       - AWS_SECRET_ACCESS_KEY
       - AWS_ACCESS_KEY_ID
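A bare name in a compose environment list passes the variable through from the shell that invokes docker-compose instead of hard-coding a value, which is how the POSTGRES_HOST from the first hunk reaches the container. For example, with a placeholder service name and host:

    POSTGRES_HOST=ro-replica.internal docker-compose run app bin/backfill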
@@ -11,6 +11,3 @@ function extract() {
 export POSTGRES_DB="$(extract POSTGRES_DB)"
 export POSTGRES_USER="$(extract POSTGRES_USER)"
 export POSTGRES_PASS="$(extract POSTGRES_PASS)"
-
-# Use the read-only replica
-export POSTGRES_HOST="$(extract POSTGRES_RO_HOST)"
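Dropping POSTGRES_HOST here is the "POSTGRES_HOST location" fix from the commit title: the read-only host now comes from .env via docker-compose rather than from the fetched credentials. The extract definition sits above this hunk; given that the Dockerfile installs jq, it plausibly reads one field from a credentials JSON, along these hypothetical lines:

    function extract() {
      jq -r ".$1" credentials.json   # field and file names assumed
    }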