azureml-examples/infra/bootstrapping/sdk_helpers.sh

874 строки
36 KiB
Bash

#!/bin/bash
# Shared helper functions for the bootstrapping scripts.
# Aborts on errors and on use of unset variables; set DEBUG to any non-empty
# value to trace every command.
set -eu
[ -n "${DEBUG:-}" ] && set -x
####################
# SET VARIABLES FOR CURRENT FILE & DIR
####################
# The filename of this script for help messages
SCRIPT_PATH="${BASH_SOURCE[0]:-$0}"
SCRIPT_DIR="$( cd "$( dirname "${SCRIPT_PATH}" )" && pwd )"
EPOCH_START="$( date -u +%s )" # e.g. 1661361223
# Both values are single strings, so they are plain scalars rather than
# (associative) arrays; "${VAR}" expands exactly as before.
SKIP_AUTO_DELETE_TILL="$(date -d "+31 days" +'%Y-%m-%d')"
DELETE_AFTER="31.00:00:00"
# Tags passed via --tags when resources are created.
COMMON_TAGS=(
  "cleanup:DeleteAfter=${DELETE_AFTER}"
  "cleanup:Policy=DeleteAfter"
  "creationTime=${EPOCH_START}"
  "owner=azuremlsdk@microsoft.com"
  "SkipAutoDeleteTill=${SKIP_AUTO_DELETE_TILL}"
  "EnableAzSecPackIdentityPolicy=true"
)
####################
# SETUP LOGGING
####################
# Log lines are appended to a file under /tmp named after the invoking script ($0).
# `--` keeps basename from treating a $0 that starts with '-' as an option.
LOG_FILE="/tmp/$(basename -- "$0").log"
readonly LOG_FILE
# Timestamp format handed to date(1); may be overridden through the environment.
DATE_FORMAT=${DATE_FORMAT:-'%Y-%m-%dT%H:%M:%S.%2N'}
readonly DATE_FORMAT
# printf format: level, timestamp, message.
LOG_FORMAT='%s : %s : %s\n'
readonly LOG_FORMAT
# Each logger prints "<level> : <timestamp> : <message>" to stderr and appends it to LOG_FILE.
# The level tags are quoted so the shell cannot glob-expand them against files named
# I, N, F, O, ... in the current directory, and all arguments are joined into one message
# so extra arguments do not make printf repeat the format.
echo_info() { printf "$LOG_FORMAT" '[INFO]' "$(date +"$DATE_FORMAT")" "$*" | tee -a "$LOG_FILE" >&2 ; }
echo_warning() { printf "$LOG_FORMAT" '[WARNING]' "$(date +"$DATE_FORMAT")" "$*" | tee -a "$LOG_FILE" >&2 ; }
echo_error() { printf "$LOG_FORMAT" '[ERROR]' "$(date +"$DATE_FORMAT")" "$*" | tee -a "$LOG_FILE" >&2 ; }
# Same as echo_error but with the FATAL tag, then terminates the shell with status 1.
echo_fatal() { printf "$LOG_FORMAT" '[FATAL]' "$(date +"$DATE_FORMAT")" "$*" | tee -a "$LOG_FILE" >&2 ; exit 1 ; }
####################
# CUSTOM ECHO FUNCTIONS TO PRINT TEXT TO THE SCREEN
####################
# Print an empty line followed by the first argument framed as "### <text> ###".
echo_title() {
  printf '\n### %s ###\n' "${1}"
}
# Print the first argument framed as "# <text> #" on a single line.
echo_subtitle() {
  printf '# %s #\n' "${1}"
}
####################
# CUSTOM FUNCTIONS
####################
# Wrapper around the pushd builtin that hides the directory-stack listing.
# Because 2>&1 is applied before stdout is pointed at /dev/null, the builtin's
# error text ends up on the caller's stdout. If the builtin fails, the shell
# exits with the builtin's status.
pushd() {
  command pushd "$@" 2>&1 >/dev/null && return
  exit
}
# Wrapper around the popd builtin that hides the directory-stack listing.
# Because 2>&1 is applied before stdout is pointed at /dev/null, the builtin's
# error text ends up on the caller's stdout. If the builtin fails, the shell
# exits with the builtin's status.
popd() {
  command popd "$@" 2>&1 >/dev/null && return
  exit
}
# Make sure an Azure ML registry exists in ${RESOURCE_GROUP_NAME}.
# Arguments: $1 - registry name (defaults to ${REGISTRY_NAME}).
# When the registry is missing, creation is delegated to ensure_registry_local
# (which reads LOCAL_REGISTRY_NAME from this function's scope) and attempted up
# to 10 times; after the 10th failed attempt the shell exits with status 1.
function ensure_registry() {
  local LOCAL_REGISTRY_NAME="${1:-${REGISTRY_NAME:-}}"
  local registry_exists
  local retry_times=0
  registry_exists=$(az ml registry list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$LOCAL_REGISTRY_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${registry_exists}" != "[]" ]]; then
    echo_warning "registry ${LOCAL_REGISTRY_NAME} already exist, skipping creation step..." >&2
    return 0
  fi
  while true; do
    retry_times=$((retry_times + 1))
    # The attempt is the `if` condition itself: with `set -e` active, a plain
    # call followed by a `$?` test would abort the shell on the first failed
    # attempt and the retry logic would never run.
    if ensure_registry_local; then
      echo_info "registry ${LOCAL_REGISTRY_NAME} created successfully" >&2
      break
    fi
    if [[ $retry_times -gt 9 ]]; then
      echo_error "Failed to create registry after 10 retries"
      exit 1
    fi
  done
}
# Single creation attempt for the registry named by LOCAL_REGISTRY_NAME (set by the caller).
# Fills the <REGISTRY-NAME> and <LOCATION> placeholders of
# infra_resources/registry-demo.yml in place, prints the file and runs
# `az ml registry create`.
# Returns: 0 when the registry is listed (before or after the attempt);
#          1, after sleeping 30 seconds, when it is still missing.
function ensure_registry_local() {
  # All paths and az arguments are quoted so a SCRIPT_DIR containing whitespace still works.
  local registry_file="${SCRIPT_DIR}/infra_resources/registry-demo.yml"
  local registry_exists
  registry_exists=$(az ml registry list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$LOCAL_REGISTRY_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${registry_exists}" != "[]" ]]; then
    return 0
  fi
  echo_info "registry ${LOCAL_REGISTRY_NAME} does not exist; creating" >&2
  # NOTE(review): the template is edited in place, so once the placeholders are
  # replaced a later call with a different name or location has nothing to substitute.
  sed -i "s/<REGISTRY-NAME>/$LOCAL_REGISTRY_NAME/" "${registry_file}"
  sed -i "s/<LOCATION>/$LOCATION/" "${registry_file}"
  cat "${registry_file}"
  az ml registry create --resource-group "${RESOURCE_GROUP_NAME}" --file "${registry_file}" --name "${LOCAL_REGISTRY_NAME}" || echo "Failed to create registry $LOCAL_REGISTRY_NAME, will retry"
  # Re-query instead of trusting the exit status of the create call.
  registry_exists=$(az ml registry list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$LOCAL_REGISTRY_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${registry_exists}" = "[]" ]]; then
    echo_info "Retry creating registry ${LOCAL_REGISTRY_NAME}" >&2
    sleep 30
    return 1
  fi
  return 0
}
# Create the resource group ${RESOURCE_GROUP_NAME} in ${LOCATION}, tagged with
# COMMON_TAGS, unless `az group exists` already reports it.
# Returns: 0 when the group exists or was created, 1 when creation failed.
function ensure_resourcegroup() {
  local rg_exists
  rg_exists=$(az group exists --resource-group "$RESOURCE_GROUP_NAME" --output tsv | tail -n1 | tr -d "[:cntrl:]")
  if [ "false" != "$rg_exists" ]; then
    echo_warning "Resource group ${RESOURCE_GROUP_NAME} already exist, skipping creation step..." >&2
    return 0
  fi
  echo_info "Resource group ${RESOURCE_GROUP_NAME} does not exist" >&2
  echo_info "Resource group ${RESOURCE_GROUP_NAME} in location: ${LOCATION} does not exist; creating" >&2
  # The create call is tested directly: with `set -e` active, a separate `$?`
  # check after a failed command is never reached, so the error was never logged.
  if ! az group create --name "${RESOURCE_GROUP_NAME}" --location "${LOCATION}" --tags "${COMMON_TAGS[@]}" > /dev/null 2>&1; then
    echo_error "Failed to create resource group ${RESOURCE_GROUP_NAME}" >&2
    return 1
  fi
  echo_info "Resource group ${RESOURCE_GROUP_NAME} created successfully" >&2
}
# Create an Azure ML workspace in ${RESOURCE_GROUP_NAME} unless it is already listed.
# Arguments: $1 - workspace name (defaults to ${WORKSPACE_NAME}).
# Globals:   CREATE_WORKSPACE is set to the combined output of the create command.
# Returns:   0 when the workspace exists or was created, 1 when creation failed.
function ensure_ml_workspace() {
  local LOCAL_WORKSPACE_NAME="${1:-${WORKSPACE_NAME:-}}"
  local workspace_exists
  workspace_exists=$(az ml workspace list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$LOCAL_WORKSPACE_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${workspace_exists}" != "[]" ]]; then
    echo_warning "Workspace ${LOCAL_WORKSPACE_NAME} already exist, skipping creation step..." >&2
    return 0
  fi
  echo_info "Workspace ${LOCAL_WORKSPACE_NAME} does not exist; creating" >&2
  # stdout and stderr are captured (not discarded) so the failure report below
  # has content, and the command is the `if` condition so the failure branch
  # is reachable while `set -e` is active.
  if CREATE_WORKSPACE=$(az ml workspace create \
      --name "${LOCAL_WORKSPACE_NAME}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --location "${LOCATION}" \
      --tags "${COMMON_TAGS[@]}" \
      --query id --output tsv \
      2>&1); then
    echo_info "Workspace ${LOCAL_WORKSPACE_NAME} created successfully" >&2
    # ensure_prerequisites_in_workspace
  else
    echo_error "Failed to create workspace ${LOCAL_WORKSPACE_NAME}" >&2
    echo "[---fail---] $CREATE_WORKSPACE."
    return 1
  fi
}
# Create an amlcompute cluster in ${RESOURCE_GROUP_NAME} unless it is already listed.
# Arguments: $1 - compute name   (default cpu-cluster)
#            $2 - min instances  (default 0)
#            $3 - max instances  (default 2)
#            $4 - VM size        (default Standard_DS3_v2)
# Globals:   COMPUTE_NAME, MIN_INSTANCES, MAX_INSTANCES, COMPUTE_SIZE and
#            CREATE_COMPUTE are assigned (they are not local).
# Returns:   0 when the compute exists or was created, 1 when creation failed.
function ensure_aml_compute() {
  COMPUTE_NAME=${1:-cpu-cluster}
  MIN_INSTANCES=${2:-0}
  MAX_INSTANCES=${3:-2}
  COMPUTE_SIZE=${4:-Standard_DS3_v2}
  local compute_exists
  compute_exists=$(az ml compute list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$COMPUTE_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${compute_exists}" != "[]" ]]; then
    echo_warning "Compute ${COMPUTE_NAME} already exist, skipping creation step..." >&2
    return 0
  fi
  echo_info "Compute ${COMPUTE_NAME} does not exist; creating" >&2
  # The command output is captured (it used to be sent to /dev/null, leaving the
  # variable empty) and the command is the `if` condition so the failure branch
  # is reachable while `set -e` is active.
  if CREATE_COMPUTE=$(az ml compute create \
      --name "${COMPUTE_NAME}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --type amlcompute --min-instances "${MIN_INSTANCES}" --max-instances "${MAX_INSTANCES}" \
      --size "${COMPUTE_SIZE}" \
      --output tsv \
      2>&1); then
    echo_info "Compute ${COMPUTE_NAME} created successfully" >&2
  else
    echo_error "Failed to create compute ${COMPUTE_NAME}" >&2
    echo "[---fail---] $CREATE_COMPUTE."
    return 1
  fi
}
# Assign the 'Storage Blob Data Owner' role on the resource group
# ${RESOURCE_GROUP_NAME} to a service principal looked up by display name.
# Arguments: $1 - service principal display name (defaults to the literal string "APP_NAME").
# Globals:   servicePrincipalAppId and RESOURCE_GROUP_ID are assigned.
function grant_permission_app_id_on_rg() {
  local SERVICE_PRINCIPAL_NAME="${1:-APP_NAME}"
  servicePrincipalAppId=$(az ad sp list --display-name "${SERVICE_PRINCIPAL_NAME}" --query "[].appId" -o tsv | tail -n1 | tr -d "[:cntrl:]")
  RESOURCE_GROUP_ID=$(az group show --name "${RESOURCE_GROUP_NAME}" --query id -o tsv | tail -n1 | tr -d "[:cntrl:]")
  # Invoke az directly with quoted arguments instead of eval-ing a command
  # string, so the looked-up values are never re-parsed by the shell.
  az role assignment create --role 'Storage Blob Data Owner' --assignee "${servicePrincipalAppId}" --scope "${RESOURCE_GROUP_ID}"
}
# Assign the Contributor and AcrPull roles to a managed identity found by name.
# Arguments: $1 - managed identity name (default "identity").
# Globals:   Id is set to the identity's principalId.
# Returns:   1 when no identity with that name is listed; otherwise the status
#            of the last role assignment.
function grant_permission_identity_on_acr() {
  local IDENTITY_NAME="${1:-identity}"
  Id=$(az identity list --query "[?name=='$IDENTITY_NAME'].principalId" -o tsv)
  if [[ -z "$Id" ]]; then
    echo_warning "Managed Identity: $IDENTITY_NAME does not exists."
    # Stop here instead of issuing role assignments with an empty object id.
    return 1
  fi
  # NOTE(review): no --scope is passed to the assignments below - confirm the intended scope.
  az role assignment create --role "Contributor" --assignee-object-id "$Id" --assignee-principal-type ServicePrincipal &> /dev/null
  az role assignment create --role "AcrPull" --assignee-object-id "$Id" --assignee-principal-type ServicePrincipal &> /dev/null
}
# Create a virtual network unless one with that name is already listed in
# ${RESOURCE_GROUP_NAME}.
# Arguments: $1 - vnet name (default "vnetName")
#            $2 - address prefix (defaults to the VNET_CIDR variable, else empty)
# Globals:   vnet_exists is assigned.
ensure_vnet() {
  local VNET_NAME="${1:-vnetName}"
  local VNET_CIDR="${2:-${VNET_CIDR:-}}"
  vnet_exists=$(az network vnet list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '$VNET_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  # Anything other than an empty JSON list means the vnet is already there.
  if [[ "${vnet_exists}" != "[]" ]]; then
    echo_warning "vnet $VNET_NAME already exists. reusing pre-created one"
    return
  fi
  echo_info "creating $VNET_NAME vnet "
  az network vnet create --name "$VNET_NAME" --address-prefixes "$VNET_CIDR" > /dev/null
  echo_info "vnet $VNET_NAME creation completed"
}
# Create a subnet inside a vnet unless one with that name is already listed.
# Arguments: $1 - vnet name (default "vnetName")
#            $2 - subnet name (default "mastersubnet")
#            $3 - subnet address prefix (defaults to the MASTER_SUBNET variable, else empty)
# Globals:   subnet_exists is assigned.
ensure_subnet() {
  local VNET_NAME="${1:-vnetName}"
  local MASTER_SUBNET_NAME="${2:-mastersubnet}"
  local MASTER_SUBNET="${3:-${MASTER_SUBNET:-}}"
  subnet_exists=$(az network vnet subnet list --resource-group "${RESOURCE_GROUP_NAME}" --vnet-name "$VNET_NAME" --query "[?name == '$MASTER_SUBNET_NAME']" | tail -n1 | tr -d "[:cntrl:]")
  # Anything other than an empty JSON list means the subnet is already there.
  if [[ "${subnet_exists}" != "[]" ]]; then
    echo_warning "subnet $MASTER_SUBNET_NAME already exists. reusing pre-created one"
    return
  fi
  echo_info "creating master subnet: $MASTER_SUBNET_NAME"
  az network vnet subnet create --vnet-name "$VNET_NAME" --name "$MASTER_SUBNET_NAME" --address-prefixes "$MASTER_SUBNET" > /dev/null
  echo_info "subnet $MASTER_SUBNET_NAME creation completed"
}
# Make sure a managed identity exists and grant it the Contributor and AcrPull
# roles on the resource group ${RESOURCE_GROUP_NAME}.
# Arguments: $1 - managed identity name (default "identityname").
# Globals:   IDENTITY_ID and RESOURCE_GROUP_ID are assigned.
function ensure_identity() {
  local IDENTITY_NAME="${1:-identityname}"
  IDENTITY_ID=$(az identity list --query "[?name=='$IDENTITY_NAME'].principalId" -o tsv)
  if [[ -z "$IDENTITY_ID" ]]; then
    echo_info "Creating Managed Identity: $IDENTITY_NAME "
    IDENTITY_ID=$(az identity create -n "$IDENTITY_NAME" --query 'principalId' -o tsv | tail -n1 | tr -d "[:cntrl:]")
    echo_info "Managed Identity: $IDENTITY_NAME creation completed"
  else
    echo_warning "Managed Identity: $IDENTITY_NAME already exists. reusing pre-created one"
  fi
  RESOURCE_GROUP_ID=$(az group show --name "${RESOURCE_GROUP_NAME}" --query id -o tsv | tail -n1 | tr -d "[:cntrl:]")
  # NOTE(review): this create call also runs when the identity was found or just
  # created above; it is kept as-is and looks redundant - confirm before removing.
  IDENTITY_ID=$(az identity create -n "$IDENTITY_NAME" --query 'principalId' -o tsv | tail -n1 | tr -d "[:cntrl:]")
  # Invoke az directly with quoted arguments instead of eval-ing command strings,
  # so the looked-up values are never re-parsed by the shell.
  az role assignment create --role 'Contributor' --assignee "${IDENTITY_ID}" --scope "${RESOURCE_GROUP_ID}"
  az role assignment create --role 'AcrPull' --assignee "${IDENTITY_ID}" --scope "${RESOURCE_GROUP_ID}"
}
# Download the AzCopy v10 Linux archive into the current directory, copy the
# binary to /usr/bin (via sudo), remove the downloaded files and verify that
# `azcopy` resolves as a command. Exits the shell with status 1 when it does not.
install_azcopy() {
  local archive="downloadazcopy-v10-linux"
  local installed_binary="/usr/bin/azcopy"
  echo_info "Installing AzCopy" >&2
  # Fetch and unpack
  wget "https://aka.ms/${archive}"
  tar -xvf "${archive}"
  # Replace any previously installed binary
  sudo rm -f "${installed_binary}"
  sudo cp ./azcopy_linux_amd64_*/azcopy /usr/bin/
  sudo chmod 755 "${installed_binary}"
  # Clean up the download and the extracted directory
  rm -f "${archive}"
  rm -rf ./azcopy_linux_amd64_*/
  echo "Testing azcopy call."
  if command -v azcopy; then
    return
  fi
  echo "azcopy was not installed"
  exit 1
}
# Return the status of `sudo dpkg -S <arg>` with all of its output discarded.
# Arguments: $1 - the pattern handed to dpkg -S.
IsInstalled() {
  sudo dpkg -S "$1" >/dev/null 2>&1
}
# Log and run `az extension add` for one extension, answering prompts with -y.
# Arguments: $1 - extension name.
add_extension() {
  local extension="$1"
  echo_info "az extension add -n ${extension} "
  az extension add -n "${extension}" -y
}
# (Re)install an az CLI extension so the latest version is present.
# Arguments: $1 - extension name.
# Globals:   EXT_VERSION is set to the version of the last listed extension whose
#            name contains $1 (empty when none is listed).
# When a version is found the extension is removed first; it is then added via add_extension.
function ensure_extension() {
  echo_info "az extension $1 version check ... "
  # Only one output format is passed; the former `-o table ... -o tsv` pair was contradictory.
  EXT_VERSION=$(az extension list --query "[?contains(name, '$1')].{Version:version}" -o tsv | tail -n1 | tr -d "[:cntrl:]")
  if [[ -z "${EXT_VERSION}" ]]; then
    echo_info "az extension \"$1\" not found."
  else
    echo_info "Remove az extionsion '$1' version ${EXT_VERSION}"
    # Per https://docs.microsoft.com/azure/machine-learning/how-to-configure-cli
    az extension remove -n "$1"
    echo_info "Add latest az extionsion \"$1\":"
  fi
  # Both paths end with a fresh install.
  add_extension "$1"
}
# Run the registry deployment script(s) located next to this file, passing
# ${REGISTRY_NAME} as their single argument. A missing script is reported
# through echo_error and skipped.
function ensure_prerequisites_in_registry() {
  local script
  local -a deploy_scripts=(
    "$SCRIPT_DIR/create-registry-components.sh"
  )
  echo_info "Ensuring prerequisites in the registry" >&2
  for script in "${deploy_scripts[@]}"; do
    echo_info "Deploying '$script'"
    if [ -f "$script" ]; then
      # Quoted so the registry name always arrives as exactly one argument.
      bash "$script" "${REGISTRY_NAME}"
    else
      echo_error "$script not found."
    fi
  done
}
# Run the workspace deployment scripts that live next to this file, in order:
# create-datasets.sh, create-components.sh, create-environments.sh.
# A missing script is reported through echo_error and the loop moves on.
ensure_prerequisites_in_workspace() {
  echo_info "Ensuring prerequisites in the workspace" >&2
  deploy_scripts=()
  # deploy_scripts+=("$SCRIPT_DIR/copy-data.sh")
  deploy_scripts+=("$SCRIPT_DIR/create-datasets.sh")
  # deploy_scripts+=("$SCRIPT_DIR/update-datasets.sh")
  deploy_scripts+=("$SCRIPT_DIR/create-components.sh")
  deploy_scripts+=("$SCRIPT_DIR/create-environments.sh")
  for script in "${deploy_scripts[@]}"; do
    echo_info "Deploying '$script'"
    if ! [ -f "$script" ]; then
      echo_error "$script not found."
    else
      bash "$script"
    fi
  done
}
# Run update-datasets.sh from the directory of this file; a missing script is
# reported through echo_error.
update_dataset() {
  echo_info "Updating dataset in the workspace" >&2
  deploy_scripts=()
  deploy_scripts+=("$SCRIPT_DIR/update-datasets.sh")
  for script in "${deploy_scripts[@]}"; do
    echo_info "Deploying '$script'"
    if ! [ -f "$script" ]; then
      echo_error "$script not found."
    else
      bash "$script"
    fi
  done
}
# Run copy-data.sh from the directory of this file; a missing script is
# reported through echo_error.
copy_dataset() {
  echo_info "Copying dataset in the workspace" >&2
  deploy_scripts=()
  deploy_scripts+=("$SCRIPT_DIR/copy-data.sh")
  for script in "${deploy_scripts[@]}"; do
    echo_info "Executing '$script'"
    if ! [ -f "$script" ]; then
      echo_error "$script not found."
    else
      bash "$script"
    fi
  done
}
# Register one resource provider namespace unless `az provider show` already
# reports its registrationState as "Registered".
# Arguments: $1 - provider namespace.
# Globals:   namespace_name and RESPONSE are assigned.
register_az_provider() {
  namespace_name=$1
  RESPONSE=$(az provider show --namespace "$namespace_name" --query registrationState -o tsv | tail -n1 | tr -d "[:cntrl:]")
  if [[ "$RESPONSE" != "Registered" ]]; then
    az provider register -n "$namespace_name"
    echo_info ">>> Provider \"$namespace_name\" registered for subscription."
  else
    echo_info ">>> $namespace_name already Registered."
  fi
}
# Register every provider namespace the scripts rely on, then make sure the
# Microsoft.ContainerService feature AKS-ExtensionManager is registered,
# polling every 10 seconds until its state reads "Registered".
# Globals: provider_list, provider and feature_registerd are assigned.
register_providers() {
  provider_list=("Microsoft.Storage")
  # For aks
  provider_list+=("Microsoft.ContainerService")
  # For arc
  provider_list+=("Microsoft.Kubernetes")
  # For amlarc extension
  provider_list+=("Microsoft.Relay" "Microsoft.KubernetesConfiguration")
  for provider in "${provider_list[@]}"; do
    register_az_provider "${provider}"
  done
  # Feature register: enables installing the add-on.
  # The state is compared including its JSON quotes because no -o tsv is used.
  feature_registerd=$(az feature show --namespace Microsoft.ContainerService --name AKS-ExtensionManager --query properties.state | tail -n1 | tr -d "[:cntrl:]")
  if [[ "$feature_registerd" == '"Registered"' ]]; then
    echo_info ">>> Microsoft.ContainerService AKS-ExtensionManager already registered"
  else
    az feature register --namespace Microsoft.ContainerService --name AKS-ExtensionManager
  fi
  until [[ "$feature_registerd" == '"Registered"' ]]; do
    sleep 10
    feature_registerd=$(az feature show --namespace Microsoft.ContainerService --name AKS-ExtensionManager --query properties.state | tail -n1 | tr -d "[:cntrl:]")
  done
}
# Reinstall the az CLI extensions used by these scripts (connectedk8s,
# k8s-extension, ml) through ensure_extension, then download kubectl for
# linux/amd64 into /usr/local/bin (via sudo) and print `az version`.
# Globals: add_extension (array) and extension_name are assigned.
install_tools() {
  local kubectl_release_url="https://storage.googleapis.com/kubernetes-release/release"
  # az upgrade --all --yes
  echo_info "Ensuring az extension on the machine." >&2
  # Arc extensions first, then the ML extension
  add_extension=(connectedk8s k8s-extension)
  add_extension+=(ml)
  for extension_name in "${add_extension[@]}"; do
    echo_info "Ensuring extension '${extension_name}'"
    ensure_extension "${extension_name}"
  done
  # The version segment comes from the release channel's stable.txt.
  curl -LO "${kubectl_release_url}"/$(curl -s "${kubectl_release_url}/stable.txt")/bin/linux/amd64/kubectl \
    && chmod +x ./kubectl \
    && sudo mv ./kubectl /usr/local/bin/kubectl \
    && az version
}
# Fetch AKS credentials for a cluster (overwriting existing entries), then list
# the cluster's namespaces with kubectl and log completion.
# Arguments: $1 - AKS cluster name (default "aks-cluster").
get_kubeconfig() {
  local AKS_CLUSTER_NAME="${1:-aks-cluster}"
  local -a credential_args=(
    --subscription "${SUBSCRIPTION_ID}"
    --resource-group "${RESOURCE_GROUP_NAME}"
    --name "${AKS_CLUSTER_NAME}"
    --overwrite-existing
  )
  az aks get-credentials "${credential_args[@]}"
  kubectl get ns
  echo_info "AKS credentials retrieved for the cluster:${AKS_CLUSTER_NAME}"
}
# Poll an Arc connected cluster until its connectivityStatus is "Connected".
# Arguments: $1 - connected cluster name (default "aks-cluster").
# Globals:   MAX_RETRIES (default 60) and SLEEP_SECONDS (default 20) bound the
#            polling; connectivityStatus is assigned on every poll and
#            CONNECTED_CLUSTER_ID is set to the cluster id on success.
# Returns:   1 when the cluster is not connected after MAX_RETRIES polls.
check_arc_status() {
  local ARC_CLUSTER_NAME="${1:-aks-cluster}"
  local attempt
  # Same defaults as check_aks_status / check_extension_status, so this function
  # no longer trips `set -u` when it is the first poller to run.
  MAX_RETRIES="${MAX_RETRIES:-60}"
  SLEEP_SECONDS="${SLEEP_SECONDS:-20}"
  connectivityStatus=""
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    connectivityStatus=$(az connectedk8s show \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --name "$ARC_CLUSTER_NAME" \
      --query connectivityStatus -o tsv | tail -n1 | tr -d "[:cntrl:]")
    echo_info "connectivityStatus: $connectivityStatus"
    if [[ "$connectivityStatus" == "Connected" ]]; then
      break
    fi
    sleep "${SLEEP_SECONDS}"
  done
  # Fail explicitly rather than relying on `set -e` to abort on a bare test.
  if [[ "$connectivityStatus" != "Connected" ]]; then
    echo_error "Cluster ${ARC_CLUSTER_NAME} did not reach the Connected state"
    return 1
  fi
  CONNECTED_CLUSTER_ID=$(az connectedk8s show -n "${ARC_CLUSTER_NAME}" -g "${RESOURCE_GROUP_NAME}" --query id -o tsv)
  # echo_info "Connected to ARC Cluster Id: ${CONNECTED_CLUSTER_ID}..."
}
# Connect an AKS cluster to Azure Arc unless a connected cluster with the
# requested name already exists, then wait for it through check_arc_status.
# Arguments: $1 - AKS cluster name (default "aks-cluster")
#            $2 - name of the connected cluster resource (default "arc-cluster")
# Globals:   clusterState is assigned when the cluster is already connected.
connect_arc() {
  local AKS_CLUSTER_NAME="${1:-aks-cluster}"
  local ARC_CLUSTER_NAME="${2:-arc-cluster}" # Name of the connected cluster resource
  echo_info "Connecting to the existing K8s cluster by installing ARC agent..."
  # the existing K8s cluster is determined by the contents of the kubeconfig file
  # get aks kubeconfig
  get_kubeconfig "$AKS_CLUSTER_NAME"
  # The right-hand side is quoted so the name is compared literally rather than
  # being treated as a glob pattern.
  if [[ "$(az connectedk8s show --resource-group "${RESOURCE_GROUP_NAME}" --name "${ARC_CLUSTER_NAME}" --query name --output tsv)" == "${ARC_CLUSTER_NAME}" ]]; then
    echo_info "Cluster: ${ARC_CLUSTER_NAME} is already connected..."
    clusterState=$(az connectedk8s show --resource-group "${RESOURCE_GROUP_NAME}" --name "${ARC_CLUSTER_NAME}" --query connectivityStatus -o json)
    # Strip the JSON quotes and any line endings.
    clusterState=$(printf '%s' "$clusterState" | tr -d '"\r\n')
    echo_info "Cluster: ${ARC_CLUSTER_NAME} current state: ${clusterState}"
  else
    echo -e "Connecting Azure via Azure Arc for Cluster: ${ARC_CLUSTER_NAME}"
    # attach/onboard the cluster to Arc
    # Called directly; wrapping the call in $( ... ) would try to execute
    # whatever the command printed.
    az connectedk8s connect \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --location "$LOCATION" \
      --name "$ARC_CLUSTER_NAME" --no-wait \
      --output tsv \
      > /dev/null 2>&1
    echo -e "Azure Arc cluster created: ${ARC_CLUSTER_NAME}"
  fi
  check_arc_status "${ARC_CLUSTER_NAME}"
}
# Attach a Kubernetes cluster to the workspace ${WORKSPACE_NAME} as a compute
# target, unless a compute with that name already reports provisioning_state "Succeeded".
# Arguments: $1 - cluster name (default "aks-cluster")
#            $2 - compute name (default "aks-compute")
#            $3 - cluster type: "connectedClusters" (default) or anything else for managedClusters
#            $4 - Kubernetes namespace (default "default")
# Globals:   ATTACH_COMPUTE is set to the provisioning state reported by the attach call.
function setup_compute() {
  echo_info "Attaching Kubernetes Compute"
  local CLUSTER_NAME="${1:-aks-cluster}"
  local COMPUTE_NAME="${2:-aks-compute}"
  local CLUSTER_TYPE="${3:-connectedClusters}"
  local COMPUTE_NS="${4:-default}"
  local RESOURCE_ID
  local SERVICE_TYPE="Kubernetes"
  if [ "${CLUSTER_TYPE}" == "connectedClusters" ]; then
    RESOURCE_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP_NAME}/providers/Microsoft.Kubernetes/ConnectedClusters/${CLUSTER_NAME}"
  else
    # managedClusters
    RESOURCE_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP_NAME}/providers/Microsoft.ContainerService/managedClusters/${CLUSTER_NAME}"
  fi
  if [[ "$(az ml compute show --resource-group "${RESOURCE_GROUP_NAME}" --name "${COMPUTE_NAME}" --query provisioning_state --output tsv)" == "Succeeded" ]]; then
    echo_info "Cluster is already attached to workspace for the cluster: ${CLUSTER_NAME} as ${COMPUTE_NAME} in workspace:${WORKSPACE_NAME} under namespace: ${COMPUTE_NS}..."
  else
    echo_info "Detach compute ${COMPUTE_NAME} in workspace:${WORKSPACE_NAME} first, as k8s compute doesn't support update"
    # Best effort: the detach is allowed to fail.
    az ml compute detach --subscription "${SUBSCRIPTION_ID}" --resource-group "${RESOURCE_GROUP_NAME}" --workspace-name "${WORKSPACE_NAME}" --name "${COMPUTE_NAME}" -y || true
    echo_info "Attaching compute to workspace for the cluster: ${CLUSTER_NAME} as ${COMPUTE_NAME} in workspace:${WORKSPACE_NAME} under namespace: ${COMPUTE_NS}"
    # Capture the provisioning state instead of discarding all output, so the
    # log line below has something to report.
    ATTACH_COMPUTE=$(az ml compute attach \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --workspace-name "${WORKSPACE_NAME}" \
      --type "${SERVICE_TYPE}" \
      --resource-id "${RESOURCE_ID}" \
      --namespace "${COMPUTE_NS}" \
      --name "${COMPUTE_NAME}" \
      --query provisioning_state \
      --output tsv)
    echo_info "ProvisioningState of ATTACH_COMPUTE: ${ATTACH_COMPUTE}"
  fi
}
# Detach a compute target from the workspace ${WORKSPACE_NAME}.
# Arguments: $1 - name passed to `az ml compute detach --name` (default "aks-cluster").
# Globals:   DETACH_COMPUTE is set to whatever the detach command prints.
function detach_compute() {
  echo_info "Detaching Kubernetes Compute"
  local CLUSTER_NAME="${1:-aks-cluster}"
  echo_info "Detaching compute to workspace for the cluster: ${CLUSTER_NAME} in workspace:${WORKSPACE_NAME}"
  # Capture the command output instead of discarding it, so the log line below
  # is not always empty.
  DETACH_COMPUTE=$(az ml compute detach \
    --subscription "${SUBSCRIPTION_ID}" \
    --resource-group "${RESOURCE_GROUP_NAME}" \
    --workspace-name "${WORKSPACE_NAME}" \
    --name "${CLUSTER_NAME}" \
    --yes \
    --output tsv)
  echo_info "ProvisioningState of DETACH_COMPUTE: ${DETACH_COMPUTE}"
}
# setup AKS
# Create an autoscaling AKS cluster in ${RESOURCE_GROUP_NAME} unless it is
# already listed, then wait for it through check_aks_status.
# Arguments: $1 - cluster name      (default aks-cluster)
#            $2 - min node count    (default 1; also the initial node count)
#            $3 - max node count    (default 3)
#            $4 - node VM size      (default STANDARD_D3_V2)
# Globals:   AKS_CLUSTER_NAME (read by check_aks_status), MIN_COUNT, MAX_COUNT,
#            VM_SKU and CREATE_COMPUTE are assigned (they are not local).
# Returns:   1 when creation fails, otherwise the status of check_aks_status.
function ensure_aks_compute() {
  AKS_CLUSTER_NAME=${1:-aks-cluster}
  MIN_COUNT="${2:-1}"
  MAX_COUNT="${3:-3}"
  VM_SKU="${4:-STANDARD_D3_V2}"
  local compute_exists
  compute_exists=$(az aks list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '${AKS_CLUSTER_NAME}']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${compute_exists}" != "[]" ]]; then
    echo_warning "AKS Compute ${AKS_CLUSTER_NAME} already exist, skipping creation step..." >&2
    check_aks_status
    return
  fi
  echo_info "AKS Compute ${AKS_CLUSTER_NAME} does not exist; creating" >&2
  # The command output is captured (it used to be sent to /dev/null, leaving the
  # variable empty) and the command is the `if` condition so the failure branch
  # is reachable while `set -e` is active.
  if CREATE_COMPUTE=$(az aks create \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --location "${LOCATION}" \
      --name "${AKS_CLUSTER_NAME}" \
      --enable-cluster-autoscaler \
      --node-count "$MIN_COUNT" \
      --min-count "$MIN_COUNT" \
      --max-count "$MAX_COUNT" \
      --node-vm-size "${VM_SKU}" \
      --no-ssh-key \
      --output tsv \
      2>&1); then
    echo_info "AKS Compute ${AKS_CLUSTER_NAME} created successfully" >&2
    check_aks_status
  else
    echo_error "Failed to create AKS compute ${AKS_CLUSTER_NAME}" >&2
    echo_info "[---fail---] $CREATE_COMPUTE."
    return 1
  fi
  # install_k8s_extension "${AKS_CLUSTER_NAME}" "managedClusters" "Microsoft.ContainerService/managedClusters"
  # setup_compute "${AKS_CLUSTER_NAME}" "managedClusters" "azureml"
}
# Check status of AKS Cluster
# Polls the provisioningState of the cluster named by ${AKS_CLUSTER_NAME} until
# it reads "Succeeded".
# Globals: MAX_RETRIES (default 60) and SLEEP_SECONDS (default 20) bound the
#          polling; provisioningState and i are assigned.
# Returns: 0 when the last observed state is "Succeeded", non-zero otherwise.
check_aks_status() {
  MAX_RETRIES="${MAX_RETRIES:-60}"
  SLEEP_SECONDS="${SLEEP_SECONDS:-20}"
  for i in $(seq 1 "$MAX_RETRIES"); do
    provisioningState=$(az aks show \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --name "${AKS_CLUSTER_NAME}" \
      --query provisioningState -o tsv | tail -n1 | tr -d "[:cntrl:]")
    echo_info "ProvisioningState: $provisioningState for the AKS cluster: ${AKS_CLUSTER_NAME}"
    if [[ $provisioningState == "Succeeded" ]]; then
      break
    fi
    # Not ready yet: wait before the next poll.
    sleep "${SLEEP_SECONDS}"
  done
  [[ $provisioningState == "Succeeded" ]]
}
# Install the k8s extension ${EXTENSION_NAME} on a cluster unless it already
# reports provisioningState "Succeeded". For connectedClusters the cluster is
# first connected to Arc under the name "<cluster>-arc".
# Arguments: $1 - cluster name (default "aks-cluster")
#            $2 - cluster type: "connectedClusters" (default) or managedClusters
#            $3 - resource type (accepted but not used by this function)
# Globals:   EXTENSION_INSTALL_STATE is assigned when the extension is created.
install_k8s_extension() {
  local CLUSTER_NAME=${1:-aks-cluster}
  local CLUSTER_TYPE="${2:-connectedClusters}" # or managedClusters
  local RESOURCE_TYPE="${3:-Microsoft.Kubernetes/connectedClusters}" # or Microsoft.ContainerService/managedClusters
  # managedClusters use the cluster name as-is
  local ARC_CLUSTER_NAME="${CLUSTER_NAME}"
  if [ "${CLUSTER_TYPE}" == "connectedClusters" ]; then
    ARC_CLUSTER_NAME="${CLUSTER_NAME}-arc"
    connect_arc "${CLUSTER_NAME}" "${ARC_CLUSTER_NAME}"
  fi
  if [[ $(az k8s-extension show --cluster-type "${CLUSTER_TYPE}" -c "${ARC_CLUSTER_NAME}" -g "${RESOURCE_GROUP_NAME}" --name "${EXTENSION_NAME}" --output tsv --query provisioningState) == "Succeeded" ]]; then
    echo "Extension:${EXTENSION_NAME} already installed on cluster: ${ARC_CLUSTER_NAME}"
    return
  fi
  echo_info "Creating k8s extension for $CLUSTER_TYPE for Azure ML extension: ${EXTENSION_NAME} on cluster: ${ARC_CLUSTER_NAME}"
  local -a create_args=(
    --cluster-name "${ARC_CLUSTER_NAME}"
    --cluster-type "${CLUSTER_TYPE}"
    --subscription "${SUBSCRIPTION_ID}"
    --resource-group "${RESOURCE_GROUP_NAME}"
    --name "${EXTENSION_NAME}"
    --extension-type "$EXTENSION_TYPE"
    --auto-upgrade "$EXT_AUTO_UPGRADE"
    --scope cluster
    --release-train "$RELEASE_TRAIN"
    # EXTENSION_SETTINGS is left unquoted on purpose: each whitespace-separated
    # setting becomes its own argument.
    --configuration-settings $EXTENSION_SETTINGS
    --no-wait
    -o tsv
  )
  EXTENSION_INSTALL_STATE=$(az k8s-extension create "${create_args[@]}" | tail -n1 | tr -d "[:cntrl:]") && echo_info "$EXTENSION_INSTALL_STATE"
  check_extension_status "${ARC_CLUSTER_NAME}" "${CLUSTER_TYPE}"
}
# Poll the provisioningState of the k8s extension ${EXTENSION_NAME} on a cluster
# until it reads "Succeeded", then log the success.
# Arguments: $1 - cluster name (default "aks-cluster")
#            $2 - cluster type: "connectedClusters" (default) or managedClusters
# Globals:   MAX_RETRIES (default 60) and SLEEP_SECONDS (default 20) bound the
#            polling; provisioningState and i are assigned.
# Returns:   non-zero when the last observed state is not "Succeeded".
check_extension_status() {
  local CLUSTER_NAME=${1:-aks-cluster}
  local CLUSTER_TYPE="${2:-connectedClusters}" # or managedClusters
  MAX_RETRIES="${MAX_RETRIES:-60}"
  SLEEP_SECONDS="${SLEEP_SECONDS:-20}"
  for i in $(seq 1 "$MAX_RETRIES"); do
    provisioningState=$(az k8s-extension show \
      --cluster-name "$CLUSTER_NAME" \
      --cluster-type "$CLUSTER_TYPE" \
      --subscription "${SUBSCRIPTION_ID}" \
      --resource-group "${RESOURCE_GROUP_NAME}" \
      --name "${EXTENSION_NAME}" \
      --query provisioningState -o tsv | tail -n1 | tr -d "[:cntrl:]")
    echo_info "ProvisioningState: '$provisioningState' for k8s-extension on the cluster: ${CLUSTER_NAME}"
    if [[ $provisioningState == "Succeeded" ]]; then
      break
    fi
    # Not ready yet: wait before the next poll.
    sleep "${SLEEP_SECONDS}"
  done
  [[ $provisioningState == "Succeeded" ]] && echo_info "$CLUSTER_TYPE for Azure ML extension: ${EXTENSION_NAME} is installed successfully on cluster: ${CLUSTER_NAME}." >&2
}
# Delete the k8s extension ${EXTENSION_NAME} from a cluster without prompting.
# Arguments: $1 - cluster name (default "aks-cluster")
#            $2 - cluster type: "connectedClusters" (default) or managedClusters
deleteArcCIExtension() {
  local CLUSTER_NAME=${1:-aks-cluster}
  local CLUSTER_TYPE="${2:-connectedClusters}" # or managedClusters
  local -a delete_args=(
    --cluster-name "$CLUSTER_NAME"
    --cluster-type "$CLUSTER_TYPE"
    --subscription "${SUBSCRIPTION_ID}"
    --resource-group "${RESOURCE_GROUP_NAME}"
    --name "${EXTENSION_NAME}"
    --yes
  )
  az k8s-extension delete "${delete_args[@]}"
}
# Example values and calls:
# CPU_INSTANCE_TYPE: "4 40Gi"
# GPU_INSTANCE_TYPE: "4 40Gi 2"
# setup_instance_type defaultinstancetype $GPU_INSTANCE_TYPE
# setup_instance_type cpu $CPU_INSTANCE_TYPE
# setup_instance_type gpu $GPU_INSTANCE_TYPE
#
# Apply an InstanceType resource (apiVersion amlarc.azureml.com/v1alpha1) to the
# cluster kubectl currently points at, by piping a generated manifest into
# `kubectl apply -f -`.
# Arguments: $1 - instance type name
#            $2 - cpu value (used for both limits and requests)
#            $3 - memory value (used for both limits and requests)
#            $4 - nvidia.com/gpu limit
# Each argument falls back to the variable of the same name that is assigned
# below (INSTANCE_TYPE_NAME, CPU, MEMORY, GPU); these assignments are not local,
# so a value passed once is reused as the fallback on later calls.
# NOTE(review): the file enables `set -u`, so an omitted argument whose fallback
# variable was never set presumably aborts here - confirm how callers that pass
# only three values (e.g. "4 40Gi") are expected to provide GPU.
# NOTE(review): the manifest lines below carry no YAML indentation in this copy
# of the file; confirm the nesting against the InstanceType schema.
setup_instance_type(){
INSTANCE_TYPE_NAME="${1:-$INSTANCE_TYPE_NAME}"
CPU="${2:-$CPU}"
MEMORY="${3:-$MEMORY}"
GPU="${4:-$GPU}"
# The heredoc delimiter is unquoted, so the variables above are expanded into the manifest.
cat <<EOF | kubectl apply -f -
apiVersion: amlarc.azureml.com/v1alpha1
kind: InstanceType
metadata:
name: $INSTANCE_TYPE_NAME
spec:
resources:
limits:
cpu: "$CPU"
memory: "$MEMORY"
nvidia.com/gpu: $GPU
requests:
cpu: "$CPU"
memory: "$MEMORY"
EOF
}
# Fetch the kubeconfig for a cluster and define the "defaultinstancetype" and
# "cpu" instance types from ${CPU_INSTANCE_TYPE}.
# Arguments: $1 - cluster name handed to get_kubeconfig (default "amlarc-inference").
setup_instance_type_aml_arc() {
  local ARC_CLUSTER_NAME="${1:-amlarc-inference}"
  local instance_type
  get_kubeconfig "${ARC_CLUSTER_NAME}"
  for instance_type in defaultinstancetype cpu; do
    # CPU_INSTANCE_TYPE is left unquoted on purpose: it holds several
    # space-separated values (e.g. "4 40Gi") that become separate arguments.
    setup_instance_type "${instance_type}" $CPU_INSTANCE_TYPE
  done
}
# Write a JSON workspace config file containing SUBSCRIPTION_ID,
# RESOURCE_GROUP_NAME and WORKSPACE_NAME, creating its parent directory if needed.
# Arguments: $1 - path of the config file (default ".azureml/config").
generate_workspace_config() {
  local CONFIG_PATH=${1:-.azureml/config}
  local FOLDER_NAME
  # dirname yields "." for a bare file name; the former rev/cut/rev pipeline
  # returned the file name itself in that case, so a directory was created where
  # the file was about to be written.
  FOLDER_NAME=$(dirname -- "${CONFIG_PATH}")
  echo "Location of the config: ${FOLDER_NAME}"
  if [[ -d "${FOLDER_NAME}" ]]; then
    echo "Directory exists: ${FOLDER_NAME}"
  else
    mkdir -p -- "${FOLDER_NAME}"
  fi
  cat << EOF > "${CONFIG_PATH}"
{
"subscription_id": "$SUBSCRIPTION_ID",
"resource_group": "$RESOURCE_GROUP_NAME",
"workspace_name": "$WORKSPACE_NAME"
}
EOF
}
# Switch every VM scale set in a resource group to automatic OS upgrades and the
# "Automatic" upgrade policy mode; scale sets already in that mode are skipped.
# Arguments: $1 - resource group name (default "testrg").
# Globals:   VM_SCALE_SETS, VMSS and VMSS_UPGRADE_POLICY_MODE are assigned.
# Failures of the individual az calls inside the loop are tolerated.
function vmss_upgrade_policy_automatic() {
  local LOCAL_RESOURCE_GROUP_NAME=${1:-testrg}
  local restore_errexit=""
  printf "Update VMSS upgrade policy in resource group %s\n" "${LOCAL_RESOURCE_GROUP_NAME}"
  # get list of all scale sets
  VM_SCALE_SETS=$(az vmss list --subscription "${SUBSCRIPTION_ID}" --resource-group "${LOCAL_RESOURCE_GROUP_NAME}" --query '[].name' --output tsv)
  printf "Checking scalesets %s in resource-group %s\n" "${VM_SCALE_SETS}" "${LOCAL_RESOURCE_GROUP_NAME}"
  # Temporarily disable errexit, remembering whether it was on so it is only
  # re-enabled for callers that actually had it set.
  case $- in
    *e*) restore_errexit=1 ;;
  esac
  set +e
  # VM_SCALE_SETS is left unquoted on purpose: one scale set name per word.
  for VMSS in ${VM_SCALE_SETS}; do
    VMSS_UPGRADE_POLICY_MODE=$(az vmss show --subscription "${SUBSCRIPTION_ID}" --resource-group "${LOCAL_RESOURCE_GROUP_NAME}" --name "${VMSS}" --query upgradePolicy.mode --output tsv)
    # az vmss show -g "${LOCAL_RESOURCE_GROUP_NAME}" -n "${VMSS}" -o json
    if [[ "$VMSS_UPGRADE_POLICY_MODE" == "Automatic" ]]; then
      echo_info "Skipping to update upgradePolicy for VMSS $VMSS in resource-group ${LOCAL_RESOURCE_GROUP_NAME}..."
      continue
    else
      echo_info "Enabling Auto OS Image upgrade for VMSS $VMSS in resource-group ${LOCAL_RESOURCE_GROUP_NAME}..."
      az vmss update --subscription "${SUBSCRIPTION_ID}" -g "${LOCAL_RESOURCE_GROUP_NAME}" -n "${VMSS}" --set upgradePolicy.automaticOSUpgradePolicy='{"enableAutomaticOSUpgrade": true, "disableAutomaticRollback": false }'
      echo_info "Updating upgradePolicy to Automatic for VMSS $VMSS in resource-group ${LOCAL_RESOURCE_GROUP_NAME}..."
      az vmss update --subscription "${SUBSCRIPTION_ID}" -g "${LOCAL_RESOURCE_GROUP_NAME}" -n "${VMSS}" --set upgradePolicy.mode='Automatic'
    fi
    # az vmss show --subscription "${SUBSCRIPTION_ID}" -g "${LOCAL_RESOURCE_GROUP_NAME}" -n "${VMSS}" --query upgradePolicy -o json
  done
  # return to the previous errexit setting
  if [[ -n "${restore_errexit}" ]]; then
    set -e
  fi
}
# For every resource group whose name starts with a prefix: merge the tag
# EnableAzSecPackIdentityPolicy=true into the group's tags (printing the tags
# before and after) and run vmss_upgrade_policy_automatic on the group.
# Arguments: $1 - resource group name prefix (default "MC_").
# Globals:   LOCAL_RESOURCE_GROUP_NAME and RESOURCE_GROUP_ID are assigned.
vmss_upgrade_policy_all_rg() {
  local RG_PREFIX="${1:-MC_}"
  local tag_key="EnableAzSecPackIdentityPolicy"
  local tag_value="true"
  # JMESPath filter shared by the count and the name listing below
  local rg_filter="[? starts_with(@.name, '${RG_PREFIX}')]"
  # checking Resource group name to ensure we're in a managed cluster RG
  echo "Number of Resource groups starting with ${RG_PREFIX}:" $(az group list --subscription "${SUBSCRIPTION_ID}" --query "${rg_filter} | length(@)")
  # az group list --query "[? starts_with(@.name, '${RG_PREFIX}')].name" -o tsv | xargs -i "$SCRIPT_DIR"/sdk_helpers.sh check_vmss "{}"
  for LOCAL_RESOURCE_GROUP_NAME in $(az group list --subscription "${SUBSCRIPTION_ID}" --query "${rg_filter}.name" --output tsv); do
    # resource_id=$(az resource list --resource-group "${LOCAL_RESOURCE_GROUP_NAME}" --query [].id --output tsv)
    RESOURCE_GROUP_ID=$(az group show --subscription "${SUBSCRIPTION_ID}" --name "${LOCAL_RESOURCE_GROUP_NAME}" --query id -o tsv | tail -n1 | tr -d "[:cntrl:]")
    echo "Current tags for resource-group ${LOCAL_RESOURCE_GROUP_NAME}"
    az tag list --subscription "${SUBSCRIPTION_ID}" --resource-id "${RESOURCE_GROUP_ID}"
    # echo "Update tag for RG ""$RESOURCE_GROUP_ID"" $tag_key tag to ""$tag_value"
    az tag update --subscription "${SUBSCRIPTION_ID}" --resource-id "$RESOURCE_GROUP_ID" --operation Merge --tags "${tag_key}=${tag_value}"
    echo "Updated tags for resource-group ${LOCAL_RESOURCE_GROUP_NAME}:"
    az tag list --subscription "${SUBSCRIPTION_ID}" --resource-id "${RESOURCE_GROUP_ID}"
    vmss_upgrade_policy_automatic "${LOCAL_RESOURCE_GROUP_NAME}"
  done
}
# Check that a command is available.
# Arguments: $1 - command name.
# Outputs:   an error message on stderr when the command cannot be found.
# Returns:   0 when found, 1 otherwise.
function validate_tool() {
  # The lookup is the `if` condition: with `set -e` active, a failed lookup
  # followed by a separate `$?` test would abort the shell before the message
  # is printed. `command -v` is a builtin, so no external `which` is required.
  if ! command -v "$1" &> /dev/null; then
    echo >&2 "Error: Unable to find required '$1' tool."
    return 1
  fi
  return 0
}
# Escape a value so it can be used literally as the replacement part of a
# sed `s/.../.../` command: '/', '&' and '\' are prefixed with a backslash.
_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's|[\\/&]|\\&|g'
}
# -e "s/max_trials = 5/max_trials=1/g"
# Fill the placeholders of a file in place with the current subscription,
# resource group, workspace, registry, cluster, compute and timestamp values,
# and apply a fixed set of rewrites (credential class names, forced pipeline
# reruns, disabled workspace creation calls, smaller max_trials). The resulting
# file content is printed afterwards.
# Arguments: $1 - file to edit.
function replace_template_values() {
  local FILENAME="$1"
  local subscription_id resource_group workspace_name registry_name
  local cluster_name compute_name time_stamp
  echo "Replacing template values in the file: ${FILENAME}"
  # Values are escaped so that '/', '&' or '\' inside them cannot break or
  # alter the sed expressions.
  subscription_id=$(_sed_escape_replacement "$SUBSCRIPTION_ID")
  resource_group=$(_sed_escape_replacement "$RESOURCE_GROUP_NAME")
  workspace_name=$(_sed_escape_replacement "$WORKSPACE_NAME")
  registry_name=$(_sed_escape_replacement "$REGISTRY_NAME")
  cluster_name=$(_sed_escape_replacement "$ARC_CLUSTER_NAME")
  compute_name=$(_sed_escape_replacement "$ARC_COMPUTE_NAME")
  time_stamp=$(_sed_escape_replacement "$timestamp")
  # NOTE(review): the two ws_private_link expressions share one pattern and run
  # in sequence, so the second also rewrites the line the first one just
  # commented out - confirm which of the two results is intended.
  sed -i -e "s/<SUBSCRIPTION_ID>/${subscription_id}/g" \
    -e "s/<RESOURCE_GROUP>/${resource_group}/g" \
    -e "s/<AML_WORKSPACE_NAME>/${workspace_name}/g" \
    -e "s/<REGISTRY_NAME>/${registry_name}/g" \
    -e "s/<CLUSTER_NAME>/${cluster_name}/g" \
    -e "s/<COMPUTE_NAME>/${compute_name}/g" \
    -e "s/<TIME_STAMP>/${time_stamp}/g" \
    -e "s/DefaultAzureCredential/AzureCliCredential/g" \
    -e "s/InteractiveBrowserCredential/AzureCliCredential/g" \
    -e "s/@pipeline(/&force_rerun=True,/g" \
    -e "s/ml_client.begin_create_or_update(ws_with_existing)/# ml_client.begin_create_or_update(ws_with_existing)/g" \
    -e "s/ml_client.workspaces.begin_create(ws_private_link)/# ml_client.workspaces.begin_create(ws_private_link)/g" \
    -e "s/ml_client.workspaces.begin_create(ws_private_link)/# ws_from_config = MLClient.from_config()/g" \
    -e "s/version=mltable_version/version=1/g" \
    -e "s/max_trials=10/max_trials=2/g" \
    -e "s/max_trials: 10/max_trials: 2/g" \
    "${FILENAME}"
  echo "$(<"${FILENAME}")"
}
# Fill the subscription, resource group, workspace and registry placeholders of
# a file in place, then print the resulting file content.
# Arguments: $1 - file to edit.
function replace_workspace_info() {
  local FILENAME="$1"
  local subscription_id resource_group workspace_name registry_name
  echo "Replacing workspace information in the file: ${FILENAME}"
  subscription_id=$(_sed_escape_replacement "$SUBSCRIPTION_ID")
  resource_group=$(_sed_escape_replacement "$RESOURCE_GROUP_NAME")
  workspace_name=$(_sed_escape_replacement "$WORKSPACE_NAME")
  registry_name=$(_sed_escape_replacement "$REGISTRY_NAME")
  sed -i -e "s/<SUBSCRIPTION_ID>/${subscription_id}/g" \
    -e "s/<RESOURCE_GROUP>/${resource_group}/g" \
    -e "s/<WORKSPACE_NAME>/${workspace_name}/g" \
    -e "s/<REGISTRY_NAME>/${registry_name}/g" \
    "${FILENAME}"
  echo "$(<"${FILENAME}")"
}
# Replace the <VERSION> placeholder of a file in place with ${timestamp}, then
# print the resulting file content.
# Arguments: $1 - file to edit.
function replace_version() {
  local FILENAME="$1"
  local version
  echo "Replacing version in the file: ${FILENAME}"
  version=$(_sed_escape_replacement "$timestamp")
  sed -i -e "s/<VERSION>/${version}/g" \
    "${FILENAME}"
  echo "$(<"${FILENAME}")"
}
# Bring up the Arc-enabled Kubernetes compute for ${ARC_CLUSTER_NAME}:
# remove the AKS cluster, the Arc connection ("<cluster>-arc") and the attached
# compute ${ARC_COMPUTE_NAME} when they exist but are unhealthy, then (re)create
# the AKS cluster, install the k8s extension, attach the compute and set up the
# instance types.
# Globals: arc_compute, compute_exists, clusterState and Status are assigned.
function ensure_k8s_compute() {
  # Arc cluster configuration
  arc_compute=${ARC_CLUSTER_NAME}
  echo_info "Checking amlarc cluster: '$arc_compute'"
  # Remove AKS if unhealthy
  compute_exists=$(az aks list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '${arc_compute}']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${compute_exists}" = "[]" ]]; then
    echo_info "AKS Compute ${arc_compute} does not exist; will create"
  else
    # The variable prefixes apply to this one call: a shorter poll for the health check.
    if ! AKS_CLUSTER_NAME=${arc_compute} MAX_RETRIES=10 SLEEP_SECONDS=30 check_aks_status; then
      echo_info "Remove unhealthy AKS: '$arc_compute'"
      az aks delete --resource-group "${RESOURCE_GROUP_NAME}" --name "${arc_compute}" --yes
    fi
  fi
  # Remove Arc if unhealthy
  compute_exists=$(az connectedk8s list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '${arc_compute}-arc']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${compute_exists}" = "[]" ]]; then
    echo_info "Arc Compute ${arc_compute}-arc does not exist; will create"
  else
    clusterState=$(az connectedk8s show --resource-group "${RESOURCE_GROUP_NAME}" --name "${arc_compute}-arc" --query connectivityStatus -o tsv)
    echo_info "Cluster: ${arc_compute}-arc current state: ${clusterState}"
    if [[ "${clusterState}" != "Connected" ]]; then
      echo_info "Remove unhealthy ARC: '${arc_compute}-arc'"
      az connectedk8s delete --resource-group "${RESOURCE_GROUP_NAME}" --name "${arc_compute}-arc" --yes
    fi
  fi
  # Remove k8s compute if unhealthy
  compute_exists=$(az ml compute list --resource-group "${RESOURCE_GROUP_NAME}" --query "[?name == '${ARC_COMPUTE_NAME}']" | tail -n1 | tr -d "[:cntrl:]")
  if [[ "${compute_exists}" = "[]" ]]; then
    # Report the compute that was actually looked up (ARC_COMPUTE_NAME), not the Arc cluster name.
    echo_info "K8s Compute ${ARC_COMPUTE_NAME} does not exist; will create"
  else
    Status=$(az ml compute show --resource-group "${RESOURCE_GROUP_NAME}" --name "${ARC_COMPUTE_NAME}" --query provisioning_state --output tsv)
    if [[ $Status == "Succeeded" ]]; then
      echo_info "K8s Compute is healthy: $Status"
    else
      echo_info "K8s Compute is unhealthy: $Status"
      az ml compute detach --subscription "${SUBSCRIPTION_ID}" --resource-group "${RESOURCE_GROUP_NAME}" --workspace-name "${WORKSPACE_NAME}" --name "${ARC_COMPUTE_NAME}" -y || true
    fi
  fi
  # NOTE(review): the location is pinned to eastus2 for the AKS cluster regardless of ${LOCATION}.
  LOCATION=eastus2 ensure_aks_compute "${arc_compute}" 1 3 "STANDARD_D3_V2"
  install_k8s_extension "${arc_compute}" "connectedClusters" "Microsoft.Kubernetes/connectedClusters"
  setup_compute "${arc_compute}-arc" "${ARC_COMPUTE_NAME}" "connectedClusters" "azureml"
  setup_instance_type_aml_arc "${arc_compute}"
  echo_info ">>> Done creating amlarc clusters"
}
# Print a heading followed by the names of all functions currently defined in the shell.
help() {
  printf '%s\n' "All functions:"
  declare -F
}
# When the file is executed directly (rather than sourced), treat the command
# line as "<function> [args...]" and run it.
[[ "$0" != "${BASH_SOURCE[0]}" ]] || "$@"