remove unhealthy compute and aks (#3329)

* remove unhealthy compute and aks

* remove unhealthy compute and aks

---------

Co-authored-by: Ubuntu <zetia@DevBox-zetia.1jltvvkrfgyuhl3llhmbyldkog.bx.internal.cloudapp.net>
This commit is contained in:
Zeliang Tian 2024-08-05 10:06:12 +08:00 коммит произвёл GitHub
Родитель 7aafd2760b
Коммит 92101a9728
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
1 изменённых файлов: 21 добавлений и 8 удалений

Просмотреть файл

@ -814,15 +814,28 @@ function ensure_k8s_compute(){
arc_compute=${ARC_CLUSTER_NAME}
echo_info "Creating amlarc cluster: '$arc_compute'"
# Check current state of AKS
provisioning_state=$(az aks show --resource-group "${RESOURCE_GROUP_NAME}" --name ${arc_compute} --query "provisioningState" -o tsv)
provisioning_state=${provisioning_state,,}
echo_info "AKS provisioning state: '$provisioning_state'"
if [[ $provisioning_state == "failed" ]]; then
echo_info "Remove unhealthy AKS: '$arc_compute' in '$provisioning_state'"
az aks delete --resource-group "${RESOURCE_GROUP_NAME}" --name ${arc_compute} --yes
# Remove AKS if unhealthy
compute_exists=$(az aks list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '${arc_compute}']" |tail -n1|tr -d "[:cntrl:]")
if [[ "${compute_exists}" = "[]" ]]; then
echo_info "AKS Compute ${arc_compute} does not exist; will create"
else
if ! AKS_CLUSTER_NAME=${arc_compute} MAX_RETRIES=10 SLEEP_SECONDS=30 check_aks_status; then
echo_info "Remove unhealthy AKS: '$arc_compute'"
az aks delete --resource-group "${RESOURCE_GROUP_NAME}" --name ${arc_compute} --yes
fi
fi
# Remove k8s compute if unhealthy
Status=$(az ml compute show --resource-group "${RESOURCE_GROUP_NAME}" --name "${ARC_COMPUTE_NAME}" --query provisioning_state --output tsv)
if
[[ $Status == "Succeeded" ]]
then
echo_info "Compute is healthy: $Status"
else
echo_info "Compute is unhealthy: $Status"
az ml compute detach --subscription "${SUBSCRIPTION_ID}" --resource-group "${RESOURCE_GROUP_NAME}" --workspace-name "${WORKSPACE_NAME}" --name "${ARC_COMPUTE_NAME}" -y || true
fi
LOCATION=eastus2 ensure_aks_compute "${arc_compute}" 1 3 "STANDARD_D3_V2"
install_k8s_extension "${arc_compute}" "connectedClusters" "Microsoft.Kubernetes/connectedClusters"
setup_compute "${arc_compute}-arc" "${ARC_COMPUTE_NAME}" "connectedClusters" "azureml"