зеркало из https://github.com/Azure/aztk.git
Feature: Spark retry docker pull (#672)
* retry docker pulls
* change order of pool, job, storage creation to reduce conflicts
* add error message on docker-compose curl failure
This commit is contained in:
Родитель
9e32b4b57b
Коммит
18b74e47d5
|
@ -22,12 +22,9 @@ def create_pool_and_job_and_table(
|
|||
:param VmImageModel: the type of image to provision for the cluster
|
||||
:param wait: wait until the cluster is ready
|
||||
"""
|
||||
# update storage with the necessary values
|
||||
# save cluster configuration in storage
|
||||
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
|
||||
|
||||
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
|
||||
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
|
||||
|
||||
# reuse pool_id as job_id
|
||||
pool_id = cluster_conf.cluster_id
|
||||
job_id = cluster_conf.cluster_id
|
||||
|
@ -71,4 +68,8 @@ def create_pool_and_job_and_table(
|
|||
# Add job to batch
|
||||
core_cluster_operations.batch_client.job.add(job)
|
||||
|
||||
# create storage task table
|
||||
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
|
||||
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
|
||||
|
||||
return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)
|
||||
|
|
|
@ -29,9 +29,6 @@ def submit_job(
|
|||
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
|
||||
job_configuration.to_cluster_config())
|
||||
|
||||
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
|
||||
core_job_operations.create_task_table(job_configuration.id)
|
||||
|
||||
# get a verified node agent sku
|
||||
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
|
||||
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
|
||||
|
@ -84,4 +81,7 @@ def submit_job(
|
|||
|
||||
core_job_operations.batch_client.job_schedule.add(setup)
|
||||
|
||||
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
|
||||
core_job_operations.create_task_table(job_configuration.id)
|
||||
|
||||
return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)
|
||||
|
|
|
@ -42,8 +42,11 @@ install_prerequisites () {
|
|||
|
||||
install_docker_compose () {
|
||||
echo "Installing Docker-Compose"
|
||||
for i in {1..5}; do
|
||||
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
|
||||
url=https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m`
|
||||
for i in {1..5}; do
|
||||
sudo curl -L $url -o /usr/local/bin/docker-compose && break ||
|
||||
echo "ERROR: failed to download docker-compose ... retrying in $($i**2) seconds" &&
|
||||
sleep $i**2;
|
||||
done
|
||||
sudo chmod +x /usr/local/bin/docker-compose
|
||||
echo "Finished installing Docker-Compose"
|
||||
|
@ -59,7 +62,12 @@ pull_docker_container () {
|
|||
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
|
||||
fi
|
||||
|
||||
docker pull $docker_repo_name
|
||||
|
||||
for i in {1..5}; do
|
||||
docker pull $docker_repo_name && break ||
|
||||
echo "ERROR: docker pull $docker_repo_name failed ... retrying after $($i**2) seconds" &&
|
||||
sleep $i**2;
|
||||
done
|
||||
echo "Finished pulling $docker_repo_name"
|
||||
}
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче