зеркало из https://github.com/Azure/aztk.git
Feature: Spark retry docker pull (#672)
* retry docker pulls
* change order of pool, job, storage creation to reduce conflicts
* add error message on docker-compose curl failure
This commit is contained in:
Родитель
9e32b4b57b
Коммит
18b74e47d5
|
@@ -22,12 +22,9 @@ def create_pool_and_job_and_table(
|
||||||
:param VmImageModel: the type of image to provision for the cluster
|
:param VmImageModel: the type of image to provision for the cluster
|
||||||
:param wait: wait until the cluster is ready
|
:param wait: wait until the cluster is ready
|
||||||
"""
|
"""
|
||||||
# update storage with the necessary values
|
# save cluster configuration in storage
|
||||||
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
|
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
|
||||||
|
|
||||||
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
|
|
||||||
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
|
|
||||||
|
|
||||||
# reuse pool_id as job_id
|
# reuse pool_id as job_id
|
||||||
pool_id = cluster_conf.cluster_id
|
pool_id = cluster_conf.cluster_id
|
||||||
job_id = cluster_conf.cluster_id
|
job_id = cluster_conf.cluster_id
|
||||||
|
@@ -71,4 +68,8 @@ def create_pool_and_job_and_table(
|
||||||
# Add job to batch
|
# Add job to batch
|
||||||
core_cluster_operations.batch_client.job.add(job)
|
core_cluster_operations.batch_client.job.add(job)
|
||||||
|
|
||||||
|
# create storage task table
|
||||||
|
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
|
||||||
|
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
|
||||||
|
|
||||||
return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)
|
return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)
|
||||||
|
|
|
@@ -29,9 +29,6 @@ def submit_job(
|
||||||
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
|
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
|
||||||
job_configuration.to_cluster_config())
|
job_configuration.to_cluster_config())
|
||||||
|
|
||||||
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
|
|
||||||
core_job_operations.create_task_table(job_configuration.id)
|
|
||||||
|
|
||||||
# get a verified node agent sku
|
# get a verified node agent sku
|
||||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
|
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
|
||||||
|
@@ -84,4 +81,7 @@ def submit_job(
|
||||||
|
|
||||||
core_job_operations.batch_client.job_schedule.add(setup)
|
core_job_operations.batch_client.job_schedule.add(setup)
|
||||||
|
|
||||||
|
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
|
||||||
|
core_job_operations.create_task_table(job_configuration.id)
|
||||||
|
|
||||||
return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)
|
return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)
|
||||||
|
|
|
@@ -42,8 +42,11 @@ install_prerequisites () {
|
||||||
|
|
||||||
install_docker_compose () {
|
install_docker_compose () {
|
||||||
echo "Installing Docker-Compose"
|
echo "Installing Docker-Compose"
|
||||||
|
url=https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m`
|
||||||
for i in {1..5}; do
|
for i in {1..5}; do
|
||||||
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
|
sudo curl -L $url -o /usr/local/bin/docker-compose && break ||
|
||||||
|
echo "ERROR: failed to download docker-compose ... retrying in $($i**2) seconds" &&
|
||||||
|
sleep $i**2;
|
||||||
done
|
done
|
||||||
sudo chmod +x /usr/local/bin/docker-compose
|
sudo chmod +x /usr/local/bin/docker-compose
|
||||||
echo "Finished installing Docker-Compose"
|
echo "Finished installing Docker-Compose"
|
||||||
|
@@ -59,7 +62,12 @@ pull_docker_container () {
|
||||||
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
|
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
|
||||||
fi
|
fi
|
||||||
|
|
||||||
docker pull $docker_repo_name
|
|
||||||
|
for i in {1..5}; do
|
||||||
|
docker pull $docker_repo_name && break ||
|
||||||
|
echo "ERROR: docker pull $docker_repo_name failed ... retrying after $($i**2) seconds" &&
|
||||||
|
sleep $i**2;
|
||||||
|
done
|
||||||
echo "Finished pulling $docker_repo_name"
|
echo "Finished pulling $docker_repo_name"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче