diff --git a/aztk/spark/helpers/create_cluster.py b/aztk/spark/helpers/create_cluster.py
index 7bb3f137..aa2bd128 100644
--- a/aztk/spark/helpers/create_cluster.py
+++ b/aztk/spark/helpers/create_cluster.py
@@ -54,6 +54,7 @@ def __docker_run_cmd(docker_repo: str = None, gpu_enabled: bool = False, file_mo
     cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
     cmd.add_option('-p', '8080:8080')    # Spark Master UI
     cmd.add_option('-p', '7077:7077')    # Spark Master
+    cmd.add_option('-p', '7337:7337')    # Spark Shuffle Service
     cmd.add_option('-p', '4040:4040')    # Job UI
     cmd.add_option('-p', '8888:8888')    # Jupyter UI
     cmd.add_option('-p', '8787:8787')    # Rstudio Server
diff --git a/config/AppRegistrations_2.png b/config/AppRegistrations_2.png
deleted file mode 100644
index b9e61db0..00000000
Binary files a/config/AppRegistrations_2.png and /dev/null differ
diff --git a/config/spark-defaults.conf b/config/spark-defaults.conf
index 415df8dc..683b41aa 100644
--- a/config/spark-defaults.conf
+++ b/config/spark-defaults.conf
@@ -25,9 +25,14 @@
 # spark.driver.memory              5g
 # spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
 
+# set "scratch" space for Spark
+spark.local.dir /mnt/batch/
+
 # Note: Aztk pre-loads wasb jars, so loading is not necessary
 spark.jars /home/spark-current/jars/azure-storage-2.0.0.jar,/home/spark-current/jars/hadoop-azure-2.7.3.jar
 
 # Note: Default filesystem master HA
 spark.deploy.recoveryMode FILESYSTEM
 spark.deploy.recoveryDirectory /root/
+
+spark.shuffle.service.enabled true
diff --git a/node_scripts/install/install.py b/node_scripts/install/install.py
index 3c0ef495..f011f1c4 100644
--- a/node_scripts/install/install.py
+++ b/node_scripts/install/install.py
@@ -29,7 +29,7 @@ def setup_node():
     else:
         setup_as_worker()
         scripts.run_custom_scripts(is_master=False, is_worker=True)
-
+
     open("/tmp/setup_complete", 'a').close()
 
 
diff --git a/node_scripts/install/spark.py b/node_scripts/install/spark.py
index 71805d12..11ed2342 100644
--- a/node_scripts/install/spark.py
+++ b/node_scripts/install/spark.py
@@ -114,6 +114,15 @@ def start_spark_worker():
     print("Connecting to master with '{0}'".format(" ".join(cmd)))
     call(cmd)
 
+    # enable the shuffle service on all slaves
+    start_shuffle_service()
+
+
+def start_shuffle_service():
+    exe = os.path.join(spark_home, "sbin", "start-shuffle-service.sh")
+    print("Starting the shuffle service with {}".format(exe))
+    call([exe])  # the script daemonizes itself; a " &" element in an argv list would be passed as a literal argument, not a shell background
+
 
 def copyfile(src, dest):
     try:
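
A slightly more defensive take on the start_shuffle_service() added above, in case the sbin script is missing from a custom Docker image. This is a sketch, not what the PR ships: the hard-coded spark_home is an assumption inferred from the /home/spark-current jar paths in spark-defaults.conf, and the error handling is illustrative.

# Sketch only: a more defensive launcher than the one in the diff.
# Assumption: spark_home matches the /home/spark-current paths used in
# spark-defaults.conf; in spark.py it is defined elsewhere in the module.
import os
from subprocess import call

spark_home = "/home/spark-current"

def start_shuffle_service():
    exe = os.path.join(spark_home, "sbin", "start-shuffle-service.sh")
    if not os.path.isfile(exe):
        raise FileNotFoundError("shuffle service launcher missing: {}".format(exe))
    # start-shuffle-service.sh backgrounds itself via spark-daemon.sh,
    # so a plain blocking call() is sufficient here
    returncode = call([exe])
    if returncode != 0:
        raise RuntimeError("start-shuffle-service.sh exited with {}".format(returncode))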
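
install.py now touches /tmp/setup_complete once node setup finishes, which gives custom scripts and health checks a cheap readiness signal. A minimal polling sketch; wait_for_setup and its timeout values are hypothetical helpers, not part of this PR:

# Hypothetical helper (not added by this PR): block until the
# /tmp/setup_complete sentinel created by install.py appears.
import os
import time

def wait_for_setup(path="/tmp/setup_complete", timeout=600, interval=5):
    """Return True once `path` exists, False if `timeout` seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path):
            return True
        time.sleep(interval)
    return False

if __name__ == "__main__":
    if not wait_for_setup():
        raise SystemExit("node setup did not complete in time")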
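
The spark-defaults.conf and port changes make the external shuffle service reachable on every node at 7337. Its main payoff is dynamic allocation, since shuffle output survives executor teardown. A hedged PySpark sketch of a job that leans on it; the spark.dynamicAllocation.* keys are standard Spark settings but are not enabled by this diff, so treat them as illustrative:

# Sketch: a PySpark job that benefits from the shuffle service enabled above.
# Assumes pyspark is importable on the cluster nodes.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("shuffle-service-demo")
         # executors can be released mid-job: their shuffle files are
         # served by the node-level shuffle service, not the executor
         .config("spark.dynamicAllocation.enabled", "true")
         .config("spark.shuffle.service.enabled", "true")
         .config("spark.shuffle.service.port", "7337")
         .getOrCreate())

# a shuffle-heavy stage whose map output outlives any single executor
counts = (spark.sparkContext
          .parallelize(range(100000))
          .map(lambda x: (x % 10, 1))
          .reduceByKey(lambda a, b: a + b)
          .collect())
print(counts)
spark.stop()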