Feature: Spark retry docker pull (#672)

* retry docker pulls

* change order of pool, job, storage creation to reduce conflicts

* add error message on docker-compose curl failure
This commit is contained in:
Jacob Freck 2018-10-24 16:49:02 -07:00 коммит произвёл GitHub
Родитель 9e32b4b57b
Коммит 18b74e47d5
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 19 добавлений и 10 удалений

Просмотреть файл

@@ -22,12 +22,9 @@ def create_pool_and_job_and_table(
:param VmImageModel: the type of image to provision for the cluster :param VmImageModel: the type of image to provision for the cluster
:param wait: wait until the cluster is ready :param wait: wait until the cluster is ready
""" """
# update storage with the necessary values # save cluster configuration in storage
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
# reuse pool_id as job_id # reuse pool_id as job_id
pool_id = cluster_conf.cluster_id pool_id = cluster_conf.cluster_id
job_id = cluster_conf.cluster_id job_id = cluster_conf.cluster_id
@@ -71,4 +68,8 @@ def create_pool_and_job_and_table(
# Add job to batch # Add job to batch
core_cluster_operations.batch_client.job.add(job) core_cluster_operations.batch_client.job.add(job)
# create storage task table
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client) return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)

Просмотреть файл

@@ -29,9 +29,6 @@ def submit_job(
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config( core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
job_configuration.to_cluster_config()) job_configuration.to_cluster_config())
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
core_job_operations.create_task_table(job_configuration.id)
# get a verified node agent sku # get a verified node agent sku
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku( sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client) vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
@@ -84,4 +81,7 @@ def submit_job(
core_job_operations.batch_client.job_schedule.add(setup) core_job_operations.batch_client.job_schedule.add(setup)
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
core_job_operations.create_task_table(job_configuration.id)
return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id) return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)

Просмотреть файл

@@ -42,8 +42,11 @@ install_prerequisites () {
install_docker_compose () { install_docker_compose () {
echo "Installing Docker-Compose" echo "Installing Docker-Compose"
for i in {1..5}; do url=https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m`
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2; for i in {1..5}; do
sudo curl -L $url -o /usr/local/bin/docker-compose && break ||
echo "ERROR: failed to download docker-compose ... retrying in $($i**2) seconds" &&
sleep $i**2;
done done
sudo chmod +x /usr/local/bin/docker-compose sudo chmod +x /usr/local/bin/docker-compose
echo "Finished installing Docker-Compose" echo "Finished installing Docker-Compose"
@@ -59,7 +62,12 @@ pull_docker_container () {
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
fi fi
docker pull $docker_repo_name
for i in {1..5}; do
docker pull $docker_repo_name && break ||
echo "ERROR: docker pull $docker_repo_name failed ... retrying after $($i**2) seconds" &&
sleep $i**2;
done
echo "Finished pulling $docker_repo_name" echo "Finished pulling $docker_repo_name"
} }