Fix issues with python3 for spark, POSTGRES_HOST location

This commit is contained in:
Anthony Miyaguchi 2021-01-28 16:47:48 -08:00
Родитель 4dd3769abf
Коммит 4884bcb46f
7 изменённых файлов: 17 добавлений и 18 удалений

Просмотреть файл

@@ -1,3 +1,5 @@
CLOUDSDK_CONFIG=~/.config/gcloud
# See https://github.com/mozilla-services/data-sandbox-terraform/tree/master/projects/etl-graph
CLOUDSDK_CORE_PROJECT=etl-graph
# See https://github.com/mozilla-services/data-sandbox-terraform/tree/master/projects/mozaggregator2bq
CLOUDSDK_CORE_PROJECT=mozaggregator2bq
# Set to the read-only host of the mozaggregator database
POSTGRES_HOST=

Просмотреть файл

@@ -1,5 +1,8 @@
FROM gcr.io/google.com/cloudsdktool/cloud-sdk
RUN echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add -
RUN apt update && apt -y install jq postgresql
WORKDIR /app

Просмотреть файл

@@ -2,10 +2,9 @@
set -e
export PROJECT="mozaggregator2bq"
export DATASET="aggregates"
export DATA_DIR="data"
START_DS=${START_DS?"START_DS must be set"}
END_DS=${END_DS?"END_DS must be set"}
START_DS=${START_DS?"must be set"}
END_DS=${END_DS?"must be set"}
function to_ds {
DS_NODASH=$1 python3 - <<EOD
@@ -66,7 +65,7 @@ EOD
cd "$(dirname "$0")/.."
# checking if spark is enabled
python -c "import pyspark; print(pyspark.__path__[0])"
python3 -c "import pyspark; print(pyspark.__path__[0])"
# checking if credentials are set, check export_credentials_s3 for full list
: "${POSTGRES_USER?'POSTGRES_USER not set'}"
@@ -78,11 +77,6 @@ function cleanup {
trap cleanup EXIT
gcloud config set project $PROJECT
if ! bq ls $PROJECT:$DATASET; then
bq mk $PROJECT:$DATASET
fi
mkdir -p "$DATA_DIR"
for ds_nodash in $(ds_nodash_range "$START_DS" "$END_DS"); do
time run_day "submission_date" "$ds_nodash"

Просмотреть файл

@@ -3,9 +3,10 @@
set -x
SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") \
PYSPARK_DRIVER_PYTHON=jupyter \
PYSPARK_DRIVER_PYTHON_OPTS=notebook \
export PYSPARK_PYTHON=python3
export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS=notebook
pyspark \
--master 'local[*]' \
--conf spark.driver.memory=8g \

Просмотреть файл

@@ -2,7 +2,8 @@
set -x
SPARK_HOME=$(python -c "import pyspark; print(pyspark.__path__[0])") \
export PYSPARK_PYTHON=python3
export SPARK_HOME=$(python3 -c "import pyspark; print(pyspark.__path__[0])")
spark-submit \
--master 'local[*]' \
--conf spark.driver.memory=8g \

Просмотреть файл

@@ -12,6 +12,7 @@ services:
environment:
- CLOUDSDK_CONFIG=/tmp/.config/gcloud
- CLOUDSDK_CORE_PROJECT
- POSTGRES_HOST
# ensure that the project can access the credentials bucket
- AWS_SECRET_ACCESS_KEY
- AWS_ACCESS_KEY_ID

Просмотреть файл

@@ -11,6 +11,3 @@ function extract() {
export POSTGRES_DB="$(extract POSTGRES_DB)"
export POSTGRES_USER="$(extract POSTGRES_USER)"
export POSTGRES_PASS="$(extract POSTGRES_PASS)"
# Use the read-only replica
export POSTGRES_HOST="$(extract POSTGRES_RO_HOST)"