cleanup: remove unused self-contained content (#5060)

Co-authored-by: Cameron Meissner <cameissner@microsoft.com>
This commit is contained in:
Cameron Meissner 2024-10-10 08:57:50 -07:00 коммит произвёл GitHub
Родитель f9bcb7af2c
Коммит 71c8d8a7b0
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
7 изменённых файлов: 1 добавлений и 2305 удалений

Просмотреть файл

@ -25,7 +25,7 @@ else
echo "shellcheck installed"
fi
filesToCheck=$(find . -type f -name "*.sh" -not -path './parts/linux/cloud-init/artifacts/*' -not -path './pkg/agent/testdata/*' -not -path './vendor/*' -not -path './hack/tools/vendor/*' -not -path './.git/*' -not -path './self-contained/*' -not -path './hack/tools/bin/shellspecsrc/*')
filesToCheck=$(find . -type f -name "*.sh" -not -path './parts/linux/cloud-init/artifacts/*' -not -path './pkg/agent/testdata/*' -not -path './vendor/*' -not -path './hack/tools/vendor/*' -not -path './.git/*' -not -path './hack/tools/bin/shellspecsrc/*')
# also shell-check generated test data
generatedTestData=$(find ./pkg/agent/testdata -type f -name "*.sh" )

Просмотреть файл

@ -1,156 +0,0 @@
# CSE bootstrap command template. It writes a start marker to the provision
# output log, assigns the node settings below as shell variables, and finally
# launches provision_start.sh.
PROVISION_OUTPUT="/var/log/azure/cluster-provision-cse-output.log";
echo $(date),$(hostname) > ${PROVISION_OUTPUT};
# When custom data is enabled, block until cloud-init finishes; a non-zero
# status is logged to the provision output file and aborts with exit code 1.
{{if ShouldEnableCustomData}}
cloud-init status --wait > /dev/null 2>&1;
[ $? -ne 0 ] && echo 'cloud-init failed' >> ${PROVISION_OUTPUT} && exit 1;
echo "cloud-init succeeded" >> ${PROVISION_OUTPUT};
{{end}}
# Custom-cloud only: set the repo depot endpoint and run the rendered init
# command, appending its output to cluster-provision.log.
{{if IsAKSCustomCloud}}
REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}"
{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1;
{{end}}
# Node and cluster settings rendered from template parameters and variables.
ADMINUSER={{GetParameter "linuxAdminUsername"}}
MOBY_VERSION={{GetParameter "mobyVersion"}}
TENANT_ID={{GetVariable "tenantID"}}
KUBERNETES_VERSION={{GetParameter "kubernetesVersion"}}
HYPERKUBE_URL={{GetParameter "kubernetesHyperkubeSpec"}}
KUBE_BINARY_URL={{GetParameter "kubeBinaryURL"}}
CUSTOM_KUBE_BINARY_URL={{GetParameter "customKubeBinaryURL"}}
PRIVATE_KUBE_BINARY_URL="{{GetLinuxPrivatePackageURL}}"
KUBEPROXY_URL={{GetParameter "kubeProxySpec"}}
APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}}
SUBSCRIPTION_ID={{GetVariable "subscriptionId"}}
RESOURCE_GROUP={{GetVariable "resourceGroup"}}
LOCATION={{GetVariable "location"}}
VM_TYPE={{GetVariable "vmType"}}
SUBNET={{GetVariable "subnetName"}}
NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}}
VIRTUAL_NETWORK={{GetVariable "virtualNetworkName"}}
VIRTUAL_NETWORK_RESOURCE_GROUP={{GetVariable "virtualNetworkResourceGroupName"}}
ROUTE_TABLE={{GetVariable "routeTableName"}}
PRIMARY_AVAILABILITY_SET={{GetVariable "primaryAvailabilitySetName"}}
PRIMARY_SCALE_SET={{GetVariable "primaryScaleSetName"}}
SERVICE_PRINCIPAL_CLIENT_ID={{GetParameter "servicePrincipalClientId"}}
NETWORK_PLUGIN={{GetParameter "networkPlugin"}}
NETWORK_POLICY={{GetParameter "networkPolicy"}}
VNET_CNI_PLUGINS_URL={{GetParameter "vnetCniLinuxPluginsURL"}}
CLOUDPROVIDER_BACKOFF={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoff"}}
CLOUDPROVIDER_BACKOFF_MODE={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffMode"}}
CLOUDPROVIDER_BACKOFF_RETRIES={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffRetries"}}
CLOUDPROVIDER_BACKOFF_EXPONENT={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffExponent"}}
CLOUDPROVIDER_BACKOFF_DURATION={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffDuration"}}
CLOUDPROVIDER_BACKOFF_JITTER={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffJitter"}}
CLOUDPROVIDER_RATELIMIT={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimit"}}
CLOUDPROVIDER_RATELIMIT_QPS={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitQPS"}}
CLOUDPROVIDER_RATELIMIT_QPS_WRITE={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitQPSWrite"}}
CLOUDPROVIDER_RATELIMIT_BUCKET={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitBucket"}}
CLOUDPROVIDER_RATELIMIT_BUCKET_WRITE={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitBucketWrite"}}
LOAD_BALANCER_DISABLE_OUTBOUND_SNAT={{GetParameterProperty "cloudproviderConfig" "cloudProviderDisableOutboundSNAT"}}
USE_MANAGED_IDENTITY_EXTENSION={{GetVariable "useManagedIdentityExtension"}}
USE_INSTANCE_METADATA={{GetVariable "useInstanceMetadata"}}
LOAD_BALANCER_SKU={{GetVariable "loadBalancerSku"}}
EXCLUDE_MASTER_FROM_STANDARD_LB={{GetVariable "excludeMasterFromStandardLB"}}
MAXIMUM_LOADBALANCER_RULE_COUNT={{GetVariable "maximumLoadBalancerRuleCount"}}
CONTAINER_RUNTIME={{GetParameter "containerRuntime"}}
CLI_TOOL={{GetParameter "cliTool"}}
CONTAINERD_DOWNLOAD_URL_BASE={{GetParameter "containerdDownloadURLBase"}}
NETWORK_MODE={{GetParameter "networkMode"}}
# NOTE(review): KUBE_BINARY_URL is assigned a second time here with the same
# template expression as the earlier assignment.
KUBE_BINARY_URL={{GetParameter "kubeBinaryURL"}}
USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}}
API_SERVER_NAME={{GetKubernetesEndpoint}}
IS_VHD={{GetVariable "isVHD"}}
GPU_NODE={{GetVariable "gpuNode"}}
SGX_NODE={{GetVariable "sgxNode"}}
MIG_NODE={{GetVariable "migNode"}}
CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED={{GetVariable "enableGPUDevicePluginIfNeeded"}}
TELEPORTD_PLUGIN_DOWNLOAD_URL={{GetParameter "teleportdPluginURL"}}
CONTAINERD_VERSION={{GetParameter "containerdVersion"}}
CONTAINERD_PACKAGE_URL={{GetParameter "containerdPackageURL"}}
RUNC_VERSION={{GetParameter "runcVersion"}}
RUNC_PACKAGE_URL={{GetParameter "runcPackageURL"}}
ENABLE_HOSTS_CONFIG_AGENT="{{EnableHostsConfigAgent}}"
DISABLE_SSH="{{ShouldDisableSSH}}"
NEEDS_CONTAINERD="{{NeedsContainerd}}"
TELEPORT_ENABLED="{{TeleportEnabled}}"
SHOULD_CONFIGURE_HTTP_PROXY="{{ShouldConfigureHTTPProxy}}"
SHOULD_CONFIGURE_HTTP_PROXY_CA="{{ShouldConfigureHTTPProxyCA}}"
HTTP_PROXY_TRUSTED_CA="{{GetHTTPProxyCA}}"
SHOULD_CONFIGURE_CUSTOM_CA_TRUST="{{ShouldConfigureCustomCATrust}}"
CUSTOM_CA_TRUST_COUNT="{{len GetCustomCATrustConfigCerts}}"
# One CUSTOM_CA_CERT_<index> variable is emitted per custom CA certificate.
{{range $i, $cert := GetCustomCATrustConfigCerts}}
CUSTOM_CA_CERT_{{$i}}="{{$cert}}"
{{end}}
IS_KRUSTLET="{{IsKrustlet}}"
GPU_NEEDS_FABRIC_MANAGER="{{GPUNeedsFabricManager}}"
#NEEDS_DOCKER_LOGIN="{{and IsDockerContainerRuntime HasPrivateAzureRegistryServer}}" This field is no longer required for the new contract since Docker is out of support and its value depends on Container Runtime = Docker
IPV6_DUAL_STACK_ENABLED="{{IsIPv6DualStackFeatureEnabled}}"
OUTBOUND_COMMAND="{{GetOutboundCommand}}"
ENABLE_UNATTENDED_UPGRADES="{{EnableUnattendedUpgrade}}"
ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE="{{ and NeedsContainerd IsKubenet (not HasCalicoNetworkPolicy) }}"
SHOULD_CONFIG_SWAP_FILE="{{ShouldConfigSwapFile}}"
SHOULD_CONFIG_TRANSPARENT_HUGE_PAGE="{{ShouldConfigTransparentHugePage}}"
SHOULD_CONFIG_CONTAINERD_ULIMITS="{{ShouldConfigContainerdUlimits}}"
CONTAINERD_ULIMITS="{{GetContainerdUlimitString}}"
{{/* both CLOUD and ENVIRONMENT have special values when IsAKSCustomCloud == true */}}
{{/* CLOUD uses AzureStackCloud and seems to be used by kubelet, k8s cloud provider */}}
{{/* target environment seems to go to ARM SDK config */}}
{{/* not sure why separate/inconsistent? */}}
{{/* see GetCustomEnvironmentJSON for more weirdness. */}}
TARGET_CLOUD="{{- if IsAKSCustomCloud -}} AzureStackCloud {{- else -}} {{GetTargetEnvironment}} {{- end -}}"
TARGET_ENVIRONMENT="{{GetTargetEnvironment}}"
CUSTOM_ENV_JSON="{{GetBase64EncodedEnvironmentJSON}}"
IS_CUSTOM_CLOUD="{{IsAKSCustomCloud}}"
CSE_HELPERS_FILEPATH="{{GetCSEHelpersScriptFilepath}}"
CSE_DISTRO_HELPERS_FILEPATH="{{GetCSEHelpersScriptDistroFilepath}}"
CSE_INSTALL_FILEPATH="{{GetCSEInstallScriptFilepath}}"
CSE_DISTRO_INSTALL_FILEPATH="{{GetCSEInstallScriptDistroFilepath}}"
CSE_CONFIG_FILEPATH="{{GetCSEConfigScriptFilepath}}"
AZURE_PRIVATE_REGISTRY_SERVER="{{GetPrivateAzureRegistryServer}}"
HAS_CUSTOM_SEARCH_DOMAIN="{{HasCustomSearchDomain}}"
CUSTOM_SEARCH_DOMAIN_FILEPATH="{{GetCustomSearchDomainsCSEScriptFilepath}}"
HTTP_PROXY_URLS="{{GetHTTPProxy}}"
HTTPS_PROXY_URLS="{{GetHTTPSProxy}}"
NO_PROXY_URLS="{{GetNoProxy}}"
PROXY_VARS="{{GetProxyVariables}}"
ENABLE_TLS_BOOTSTRAPPING="{{EnableTLSBootstrapping}}"
ENABLE_SECURE_TLS_BOOTSTRAPPING="{{EnableSecureTLSBootstrapping}}"
DHCPV6_SERVICE_FILEPATH="{{GetDHCPv6ServiceCSEScriptFilepath}}"
DHCPV6_CONFIG_FILEPATH="{{GetDHCPv6ConfigCSEScriptFilepath}}"
THP_ENABLED="{{GetTransparentHugePageEnabled}}"
THP_DEFRAG="{{GetTransparentHugePageDefrag}}"
SERVICE_PRINCIPAL_FILE_CONTENT="{{GetServicePrincipalSecret}}"
KUBELET_CLIENT_CONTENT="{{GetKubeletClientKey}}"
KUBELET_CLIENT_CERT_CONTENT="{{GetKubeletClientCert}}"
KUBELET_CONFIG_FILE_ENABLED="{{IsKubeletConfigFileEnabled}}"
KUBELET_CONFIG_FILE_CONTENT="{{GetKubeletConfigFileContentBase64}}"
SWAP_FILE_SIZE_MB="{{GetSwapFileSizeMB}}"
GPU_DRIVER_VERSION="{{GPUDriverVersion}}"
GPU_INSTANCE_PROFILE="{{GetGPUInstanceProfile}}"
CUSTOM_SEARCH_DOMAIN_NAME="{{GetSearchDomainName}}"
CUSTOM_SEARCH_REALM_USER="{{GetSearchDomainRealmUser}}"
CUSTOM_SEARCH_REALM_PASSWORD="{{GetSearchDomainRealmPassword}}"
MESSAGE_OF_THE_DAY="{{GetMessageOfTheDay}}"
HAS_KUBELET_DISK_TYPE="{{HasKubeletDiskType}}"
NEEDS_CGROUPV2="{{IsCgroupV2}}"
TLS_BOOTSTRAP_TOKEN="{{GetTLSBootstrapTokenForKubeConfig}}"
KUBELET_FLAGS="{{GetKubeletConfigKeyVals}}"
# NOTE(review): NETWORK_POLICY was already assigned above (unquoted); this
# later, quoted assignment is the one that takes effect.
NETWORK_POLICY="{{GetParameter "networkPolicy"}}"
{{- if not (IsKubernetesVersionGe "1.17.0")}}
KUBELET_IMAGE="{{GetHyperkubeImageReference}}"
{{end}}
{{if IsKubernetesVersionGe "1.16.0"}}
KUBELET_NODE_LABELS="{{GetAgentKubernetesLabels . }}"
{{else}}
KUBELET_NODE_LABELS="{{GetAgentKubernetesLabelsDeprecated . }}"
{{end}}
AZURE_ENVIRONMENT_FILEPATH="{{- if IsAKSCustomCloud}}/etc/kubernetes/{{GetTargetEnvironment}}.json{{end}}"
KUBE_CA_CRT="{{GetParameter "caCertificate"}}"
KUBENET_TEMPLATE="{{GetKubenetTemplate}}"
CONTAINERD_CONFIG_CONTENT="{{GetContainerdConfigContent}}"
CONTAINERD_CONFIG_NO_GPU_CONTENT="{{GetContainerdConfigNoGPUContent}}"
IS_KATA="{{IsKata}}"
ARTIFACT_STREAMING_ENABLED="{{IsArtifactStreamingEnabled}}"
SYSCTL_CONTENT="{{GetSysctlContent}}"
PRIVATE_EGRESS_PROXY_ADDRESS="{{GetPrivateEgressProxyAddress}}"
# Hand off to the provisioning script, run under nohup.
/usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh"

Просмотреть файл

@ -1,697 +0,0 @@
#!/bin/bash
# Last character of the hostname: tail -c 2 keeps the final character plus the
# trailing newline, and the command substitution strips the newline.
NODE_INDEX=$(hostname | tail -c 2)
# Full hostname; used further down as the CN/SAN of the kubelet serving cert.
NODE_NAME=$(hostname)
# Reset the password-aging policy of the admin account and print the result.
# Globals: ADMINUSER (read)
configureAdminUser() {
    local admin_account="${ADMINUSER}"

    # -E -1 / -I -1 remove the account expiry date and the inactivity lock;
    # -m 0 / -M 99999 set the minimum / maximum password age in days.
    chage -E -1 -I -1 -m 0 -M 99999 "${admin_account}"

    # Show the resulting aging information.
    chage -l "${admin_account}"
}
# Write a systemd drop-in that passes the API server name to the
# reconcile-private-hosts unit, then enable and start that unit.
# Globals: API_SERVER_NAME (read), ERR_SYSTEMCTL_START_FAIL (read)
configPrivateClusterHosts() {
    local dropin_dir="/etc/systemd/system/reconcile-private-hosts.service.d"
    local dropin_file="${dropin_dir}/10-fqdn.conf"

    mkdir -p "${dropin_dir}/"
    touch "${dropin_file}"
    # Unquoted delimiter: API_SERVER_NAME is expanded while writing the file.
    tee "${dropin_file}" > /dev/null <<EOF
[Service]
Environment="KUBE_API_SERVER_NAME=${API_SERVER_NAME}"
EOF
    systemctlEnableAndStart reconcile-private-hosts || exit $ERR_SYSTEMCTL_START_FAIL
}
# Apply the requested transparent-hugepage settings to the running kernel and
# append matching entries to /etc/sysfs.conf. Each setting is skipped when its
# variable is empty.
# Globals: THP_ENABLED, THP_DEFRAG (read); ETC_SYSFS_CONF (written)
configureTransparentHugePage() {
    ETC_SYSFS_CONF="/etc/sysfs.conf"
    local thp_sysfs_dir="/sys/kernel/mm/transparent_hugepage"

    if [[ -n "${THP_ENABLED}" ]]; then
        echo "${THP_ENABLED}" > "${thp_sysfs_dir}/enabled"
        echo "kernel/mm/transparent_hugepage/enabled=${THP_ENABLED}" >> ${ETC_SYSFS_CONF}
    fi

    if [[ -n "${THP_DEFRAG}" ]]; then
        echo "${THP_DEFRAG}" > "${thp_sysfs_dir}/defrag"
        echo "kernel/mm/transparent_hugepage/defrag=${THP_DEFRAG}" >> ${ETC_SYSFS_CONF}
    fi
}
# Create and activate a swap file of SWAP_FILE_SIZE_MB megabytes.
# The resource disk is preferred; if it is absent or too small the OS disk is
# used. The swap file is also registered in /etc/fstab.
# Globals:
#   SWAP_FILE_SIZE_MB (read)
#   ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE, ERR_SWAP_CREATE_FAIL (read)
#   swap_size_kb, swap_location, resource_disk_path, disk_free_kb, os_device (written)
# Exits the script when there is not enough free space or a swap step fails.
configureSwapFile() {
    # https://learn.microsoft.com/en-us/troubleshoot/azure/virtual-machines/troubleshoot-device-names-problems#identify-disk-luns
    # NOTE: the MB -> KB conversion uses a factor of 1000; kept as in the original.
    swap_size_kb=$(( ${SWAP_FILE_SIZE_MB} * 1000 ))
    swap_location=""

    # Attempt to use the resource disk
    if [[ -L /dev/disk/azure/resource-part1 ]]; then
        resource_disk_path=$(findmnt -nr -o target -S "$(readlink -f /dev/disk/azure/resource-part1)")
        # -P keeps every filesystem on one output line so column 4 is always
        # the available space (matches the OS-disk check below).
        disk_free_kb=$(df -P "${resource_disk_path}" | sed 1d | awk '{print $4}')
        if [[ ${disk_free_kb} -gt ${swap_size_kb} ]]; then
            echo "Will use resource disk for swap file"
            swap_location=${resource_disk_path}/swapfile
        else
            echo "Insufficient disk space on resource disk to create swap file: request ${swap_size_kb} free ${disk_free_kb}, attempting to fall back to OS disk..."
        fi
    fi

    # If we couldn't use the resource disk, attempt to use the OS disk
    if [[ -z "${swap_location}" ]]; then
        # Directly check size on the root directory since we can't rely on 'root-part1' always being the correct label
        os_device=$(readlink -f /dev/disk/azure/root)
        disk_free_kb=$(df -P / | sed 1d | awk '{print $4}')
        if [[ ${disk_free_kb} -gt ${swap_size_kb} ]]; then
            echo "Will use OS disk for swap file"
            swap_location=/swapfile
        else
            echo "Insufficient disk space on OS device ${os_device} to create swap file: request ${swap_size_kb} free ${disk_free_kb}"
            exit $ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE
        fi
    fi

    echo "Swap file will be saved to: ${swap_location}"
    retrycmd_if_failure 24 5 25 fallocate -l "${swap_size_kb}K" "${swap_location}" || exit $ERR_SWAP_CREATE_FAIL
    chmod 600 "${swap_location}"
    retrycmd_if_failure 24 5 25 mkswap "${swap_location}" || exit $ERR_SWAP_CREATE_FAIL
    retrycmd_if_failure 24 5 25 swapon "${swap_location}" || exit $ERR_SWAP_CREATE_FAIL
    # Confirm the new swap file is listed as active.
    retrycmd_if_failure 24 5 25 swapon --show | grep "${swap_location}" || exit $ERR_SWAP_CREATE_FAIL
    echo "${swap_location} none swap sw 0 0" >> /etc/fstab
}
# Propagate the configured HTTP/HTTPS/no-proxy settings to /etc/environment,
# the apt proxy config and the systemd manager defaults, and make kubelet load
# /etc/environment through a drop-in.
# Globals: HTTP_PROXY_URLS, HTTPS_PROXY_URLS, NO_PROXY_URLS (read)
configureEtcEnvironment() {
    local env_file="/etc/environment"
    local systemd_proxy_conf="/etc/systemd/system.conf.d/proxy.conf"
    local apt_proxy_conf="/etc/apt/apt.conf.d/95proxy"
    local kubelet_dropin_dir="/etc/systemd/system/kubelet.service.d"

    mkdir -p /etc/systemd/system.conf.d/
    touch "${systemd_proxy_conf}"
    chmod 0644 "${systemd_proxy_conf}"

    mkdir -p /etc/apt/apt.conf.d
    touch "${apt_proxy_conf}"
    chmod 0644 "${apt_proxy_conf}"

    # TODO(ace): this pains me but quick and dirty refactor
    echo "[Manager]" >> "${systemd_proxy_conf}"

    if [[ -n "${HTTP_PROXY_URLS}" ]]; then
        {
            echo "HTTP_PROXY=${HTTP_PROXY_URLS}"
            echo "http_proxy=${HTTP_PROXY_URLS}"
        } >> "${env_file}"
        echo "Acquire::http::proxy \"${HTTP_PROXY_URLS}\";" >> "${apt_proxy_conf}"
        {
            echo "DefaultEnvironment=\"HTTP_PROXY=${HTTP_PROXY_URLS}\""
            echo "DefaultEnvironment=\"http_proxy=${HTTP_PROXY_URLS}\""
        } >> "${systemd_proxy_conf}"
    fi

    if [[ -n "${HTTPS_PROXY_URLS}" ]]; then
        {
            echo "HTTPS_PROXY=${HTTPS_PROXY_URLS}"
            echo "https_proxy=${HTTPS_PROXY_URLS}"
        } >> "${env_file}"
        echo "Acquire::https::proxy \"${HTTPS_PROXY_URLS}\";" >> "${apt_proxy_conf}"
        {
            echo "DefaultEnvironment=\"HTTPS_PROXY=${HTTPS_PROXY_URLS}\""
            echo "DefaultEnvironment=\"https_proxy=${HTTPS_PROXY_URLS}\""
        } >> "${systemd_proxy_conf}"
    fi

    if [[ -n "${NO_PROXY_URLS}" ]]; then
        {
            echo "NO_PROXY=${NO_PROXY_URLS}"
            echo "no_proxy=${NO_PROXY_URLS}"
        } >> "${env_file}"
        {
            echo "DefaultEnvironment=\"NO_PROXY=${NO_PROXY_URLS}\""
            echo "DefaultEnvironment=\"no_proxy=${NO_PROXY_URLS}\""
        } >> "${systemd_proxy_conf}"
    fi

    # for kubelet to pick up the proxy
    mkdir -p "${kubelet_dropin_dir}"
    # Quoted delimiter: the drop-in content is written literally.
    tee "${kubelet_dropin_dir}/10-httpproxy.conf" > /dev/null <<'EOF'
[Service]
EnvironmentFile=/etc/environment
EOF
}
# Install the base64-encoded HTTP proxy CA into the distro's trust anchors and
# refresh the system trust store.
# Globals: OS, HTTP_PROXY_TRUSTED_CA, ERR_UPDATE_CA_CERTS (read);
#          cert_dest, update_cmd (written)
configureHTTPProxyCA() {
    # Defaults apply when the OS is not Mariner / Azure Linux.
    cert_dest="/usr/local/share/ca-certificates"
    update_cmd="update-ca-certificates"
    if isMarinerOrAzureLinux "$OS"; then
        cert_dest="/usr/share/pki/ca-trust-source/anchors"
        update_cmd="update-ca-trust"
    fi

    echo "${HTTP_PROXY_TRUSTED_CA}" | base64 -d > "${cert_dest}/proxyCA.crt" || exit $ERR_UPDATE_CA_CERTS
    $update_cmd || exit $ERR_UPDATE_CA_CERTS
}
# Decode every CUSTOM_CA_CERT_<n> variable into /opt/certs, refresh the trust
# store through update_certs.service and restart containerd.
# Globals: CUSTOM_CA_TRUST_COUNT, CUSTOM_CA_CERT_<n>, ERR_UPDATE_CA_CERTS (read)
configureCustomCaCertificate() {
    local cert_dir="/opt/certs"
    local cert_var

    mkdir -p "${cert_dir}"
    for (( i = 0; i < CUSTOM_CA_TRUST_COUNT; i++ )); do
        # "${CUSTOM_CA_CERT_${i}}" is a bad substitution in bash, so build the
        # variable name first and dereference it with ${!name}.
        cert_var="CUSTOM_CA_CERT_${i}"
        echo "${!cert_var}" | base64 -d > "${cert_dir}/00000000000000cert${i}.crt"
    done

    # update_certs.service is a oneshot unit: the restart call returns only
    # after its ExecStart= command has finished, and succeeds only when that
    # command exited with status zero.
    systemctl restart update_certs.service || exit $ERR_UPDATE_CA_CERTS

    # containerd does not pick up newly trusted certs until it is restarted.
    # Restart it here so a freshly provisioned node works straight away.
    # (Later cert changes arrive via the custom CA daemonset, which triggers
    # the update_certs.path unit; that flow could restart containerd too.)
    systemctl restart containerd
}
# Write the containerd ulimit drop-in (one entry of CONTAINERD_ULIMITS per
# line) and restart containerd so it takes effect.
# Globals: CONTAINERD_ULIMITS (read);
#          CONTAINERD_ULIMIT_DROP_IN_FILE_PATH (written)
configureContainerdUlimits() {
    CONTAINERD_ULIMIT_DROP_IN_FILE_PATH="/etc/systemd/system/containerd.service.d/set_ulimits.conf"

    touch "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}"
    chmod 0600 "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}"
    # Spaces in CONTAINERD_ULIMITS separate the entries; turn each into a newline.
    tee "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" > /dev/null <<EOF
$(tr ' ' '\n' <<< "$CONTAINERD_ULIMITS")
EOF

    systemctl daemon-reload
    systemctl restart containerd
}
# Generate a 2048-bit RSA key and a self-signed serving certificate for
# kubelet, with the node name as both CN and DNS subjectAltName.
# Globals: NODE_NAME (read);
#          KUBELET_SERVER_PRIVATE_KEY_PATH, KUBELET_SERVER_CERT_PATH (written)
configureKubeletServerCert() {
    KUBELET_SERVER_PRIVATE_KEY_PATH="/etc/kubernetes/certs/kubeletserver.key"
    KUBELET_SERVER_CERT_PATH="/etc/kubernetes/certs/kubeletserver.crt"

    openssl genrsa -out "${KUBELET_SERVER_PRIVATE_KEY_PATH}" 2048
    openssl req -new -x509 -days 7300 \
        -key "${KUBELET_SERVER_PRIVATE_KEY_PATH}" \
        -out "${KUBELET_SERVER_CERT_PATH}" \
        -subj "/CN=${NODE_NAME}" \
        -addext "subjectAltName=DNS:${NODE_NAME}"
}
# Materialise the Kubernetes credentials and cloud-provider configuration:
#   - API server cert, kubelet client key/cert (decoded from base64)
#   - /etc/kubernetes/azure.json (cloud provider config, includes the SP secret)
#   - kubelet serving certificate (via configureKubeletServerCert)
#   - custom cloud environment JSON when IS_CUSTOM_CLOUD is "true"
#   - kubelet config file and its systemd drop-in when
#     KUBELET_CONFIG_FILE_ENABLED is "true"
# Shell tracing is switched off (set +x) around every step that handles
# secret material and switched back on afterwards.
# Globals: reads the variables referenced below; writes
#   APISERVER_PUBLIC_KEY_PATH, AZURE_JSON_PATH, SP_FILE,
#   SERVICE_PRINCIPAL_CLIENT_SECRET, AKS_CUSTOM_CLOUD_JSON_PATH,
#   KUBELET_CONFIG_JSON_PATH, KUBELET_CONFIG_DROP_IN.
configureK8s() {
    # The certs directory must exist before any file is created inside it.
    mkdir -p "/etc/kubernetes/certs"

    APISERVER_PUBLIC_KEY_PATH="/etc/kubernetes/certs/apiserver.crt"
    touch "${APISERVER_PUBLIC_KEY_PATH}"
    chmod 0644 "${APISERVER_PUBLIC_KEY_PATH}"
    chown root:root "${APISERVER_PUBLIC_KEY_PATH}"

    AZURE_JSON_PATH="/etc/kubernetes/azure.json"
    touch "${AZURE_JSON_PATH}"
    chmod 0600 "${AZURE_JSON_PATH}"
    chown root:root "${AZURE_JSON_PATH}"

    set +x
    if [ -n "${KUBELET_CLIENT_CONTENT}" ]; then
        echo "${KUBELET_CLIENT_CONTENT}" | base64 -d > /etc/kubernetes/certs/client.key
    fi
    if [ -n "${KUBELET_CLIENT_CERT_CONTENT}" ]; then
        echo "${KUBELET_CLIENT_CERT_CONTENT}" | base64 -d > /etc/kubernetes/certs/client.crt
    fi
    if [ -n "${SERVICE_PRINCIPAL_FILE_CONTENT}" ]; then
        echo "${SERVICE_PRINCIPAL_FILE_CONTENT}" | base64 -d > /etc/kubernetes/sp.txt
    fi

    echo "${APISERVER_PUBLIC_KEY}" | base64 --decode > "${APISERVER_PUBLIC_KEY_PATH}"

    # Perform the required JSON escaping
    SP_FILE="/etc/kubernetes/sp.txt"
    # The file only exists when a service principal secret was supplied;
    # without it the secret stays empty instead of cat/rm failing noisily.
    SERVICE_PRINCIPAL_CLIENT_SECRET=""
    if [ -f "$SP_FILE" ]; then
        SERVICE_PRINCIPAL_CLIENT_SECRET="$(cat "$SP_FILE")"
    fi
    SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\\/\\\\}
    SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\"/\\\"}
    rm -f "$SP_FILE" # unneeded after reading from disk.
    cat << EOF > "${AZURE_JSON_PATH}"
{
"cloud": "${TARGET_CLOUD}",
"tenantId": "${TENANT_ID}",
"subscriptionId": "${SUBSCRIPTION_ID}",
"aadClientId": "${SERVICE_PRINCIPAL_CLIENT_ID}",
"aadClientSecret": "${SERVICE_PRINCIPAL_CLIENT_SECRET}",
"resourceGroup": "${RESOURCE_GROUP}",
"location": "${LOCATION}",
"vmType": "${VM_TYPE}",
"subnetName": "${SUBNET}",
"securityGroupName": "${NETWORK_SECURITY_GROUP}",
"vnetName": "${VIRTUAL_NETWORK}",
"vnetResourceGroup": "${VIRTUAL_NETWORK_RESOURCE_GROUP}",
"routeTableName": "${ROUTE_TABLE}",
"primaryAvailabilitySetName": "${PRIMARY_AVAILABILITY_SET}",
"primaryScaleSetName": "${PRIMARY_SCALE_SET}",
"cloudProviderBackoffMode": "${CLOUDPROVIDER_BACKOFF_MODE}",
"cloudProviderBackoff": ${CLOUDPROVIDER_BACKOFF},
"cloudProviderBackoffRetries": ${CLOUDPROVIDER_BACKOFF_RETRIES},
"cloudProviderBackoffExponent": ${CLOUDPROVIDER_BACKOFF_EXPONENT},
"cloudProviderBackoffDuration": ${CLOUDPROVIDER_BACKOFF_DURATION},
"cloudProviderBackoffJitter": ${CLOUDPROVIDER_BACKOFF_JITTER},
"cloudProviderRateLimit": ${CLOUDPROVIDER_RATELIMIT},
"cloudProviderRateLimitQPS": ${CLOUDPROVIDER_RATELIMIT_QPS},
"cloudProviderRateLimitBucket": ${CLOUDPROVIDER_RATELIMIT_BUCKET},
"cloudProviderRateLimitQPSWrite": ${CLOUDPROVIDER_RATELIMIT_QPS_WRITE},
"cloudProviderRateLimitBucketWrite": ${CLOUDPROVIDER_RATELIMIT_BUCKET_WRITE},
"useManagedIdentityExtension": ${USE_MANAGED_IDENTITY_EXTENSION},
"userAssignedIdentityID": "${USER_ASSIGNED_IDENTITY_ID}",
"useInstanceMetadata": ${USE_INSTANCE_METADATA},
"loadBalancerSku": "${LOAD_BALANCER_SKU}",
"disableOutboundSNAT": ${LOAD_BALANCER_DISABLE_OUTBOUND_SNAT},
"excludeMasterFromStandardLB": ${EXCLUDE_MASTER_FROM_STANDARD_LB},
"providerVaultName": "${KMS_PROVIDER_VAULT_NAME}",
"maximumLoadBalancerRuleCount": ${MAXIMUM_LOADBALANCER_RULE_COUNT},
"providerKeyName": "k8s",
"providerKeyVersion": ""
}
EOF
    set -x

    # In backoff mode "v2" the exponent and jitter keys are removed again.
    if [[ "${CLOUDPROVIDER_BACKOFF_MODE}" = "v2" ]]; then
        sed -i "/cloudProviderBackoffExponent/d" /etc/kubernetes/azure.json
        sed -i "/cloudProviderBackoffJitter/d" /etc/kubernetes/azure.json
    fi

    configureKubeletServerCert

    if [ "${IS_CUSTOM_CLOUD}" == "true" ]; then
        set +x
        AKS_CUSTOM_CLOUD_JSON_PATH="/etc/kubernetes/${TARGET_ENVIRONMENT}.json"
        touch "${AKS_CUSTOM_CLOUD_JSON_PATH}"
        chmod 0600 "${AKS_CUSTOM_CLOUD_JSON_PATH}"
        chown root:root "${AKS_CUSTOM_CLOUD_JSON_PATH}"
        echo "${CUSTOM_ENV_JSON}" | base64 -d > "${AKS_CUSTOM_CLOUD_JSON_PATH}"
        set -x
    fi

    if [ "${KUBELET_CONFIG_FILE_ENABLED}" == "true" ]; then
        set +x
        KUBELET_CONFIG_JSON_PATH="/etc/default/kubeletconfig.json"
        touch "${KUBELET_CONFIG_JSON_PATH}"
        chmod 0600 "${KUBELET_CONFIG_JSON_PATH}"
        chown root:root "${KUBELET_CONFIG_JSON_PATH}"
        echo "${KUBELET_CONFIG_FILE_CONTENT}" | base64 -d > "${KUBELET_CONFIG_JSON_PATH}"
        set -x
        KUBELET_CONFIG_DROP_IN="/etc/systemd/system/kubelet.service.d/10-componentconfig.conf"
        # Make sure the drop-in directory exists before creating the file.
        mkdir -p "$(dirname "${KUBELET_CONFIG_DROP_IN}")"
        touch "${KUBELET_CONFIG_DROP_IN}"
        chmod 0600 "${KUBELET_CONFIG_DROP_IN}"
        tee "${KUBELET_CONFIG_DROP_IN}" > /dev/null <<EOF
[Service]
Environment="KUBELET_CONFIG_FILE_FLAGS=--config /etc/default/kubeletconfig.json"
EOF
    fi
}
# Load the br_netfilter kernel module, register it for loading at boot, and
# apply the CNI-specific configuration.
# Globals: ERR_MODPROBE_FAIL (read)
configureCNI() {
    local module="br_netfilter"

    # needed for the iptables rules to work on bridges
    retrycmd_if_failure 120 5 25 modprobe "${module}" || exit $ERR_MODPROBE_FAIL
    # Written without a trailing newline.
    printf '%s' "${module}" > "/etc/modules-load.d/${module}.conf"
    configureCNIIPTables
}
# For the "azure" network plugin: move the Azure CNI conflist into the CNI
# config directory, switch it from bridge to transparent mode when required,
# and list the ebtables nat table. Does nothing for other plugins.
# Globals: NETWORK_PLUGIN, NETWORK_POLICY, NETWORK_MODE,
#          CNI_BIN_DIR, CNI_CONFIG_DIR (read)
configureCNIIPTables() {
    [[ "${NETWORK_PLUGIN}" = "azure" ]] || return 0

    local conflist="$CNI_CONFIG_DIR/10-azure.conflist"
    local use_transparent="false"

    mv "$CNI_BIN_DIR/10-azure.conflist" "$CNI_CONFIG_DIR/"
    chmod 600 "${conflist}"

    # Transparent mode is used with calico, or when there is no network policy
    # and the network mode explicitly asks for it.
    case "${NETWORK_POLICY}" in
        calico)
            use_transparent="true"
            ;;
        "" | none)
            if [[ "${NETWORK_MODE}" == "transparent" ]]; then
                use_transparent="true"
            fi
            ;;
    esac
    if [[ "${use_transparent}" == "true" ]]; then
        sed -i 's#"mode":"bridge"#"mode":"transparent"#g' "${conflist}"
    fi

    /sbin/ebtables -t nat --list
}
# On Ubuntu 18.04 / 20.04 / 22.04, point /etc/resolv.conf at the resolv.conf
# generated by systemd-resolved (if present). The current resolv.conf is
# printed before, and again after, the change.
# Globals: UBUNTU_RELEASE (written)
disableSystemdResolved() {
    local resolved_conf="/run/systemd/resolve/resolv.conf"

    ls -ltr /etc/resolv.conf
    cat /etc/resolv.conf
    UBUNTU_RELEASE=$(lsb_release -r -s)

    case "${UBUNTU_RELEASE}" in
        "18.04" | "20.04" | "22.04")
            echo "Ingorings systemd-resolved query service but using its resolv.conf file"
            echo "This is the simplest approach to workaround resolved issues without completely uninstall it"
            [ -f "${resolved_conf}" ] && sudo ln -sf "${resolved_conf}" /etc/resolv.conf
            ls -ltr /etc/resolv.conf
            cat /etc/resolv.conf
            ;;
    esac
}
# Configure and start containerd:
#   - start teleportd first when TELEPORT_ENABLED is "true"
#   - install a drop-in that sets the iptables FORWARD policy to ACCEPT after start
#   - enable artifact streaming when ARTIFACT_STREAMING_ENABLED is "true"
#   - write /etc/containerd/config.toml (non-GPU variant on GPU nodes whose
#     driver install is skipped)
#   - apply the forwarding sysctls, stop docker if it is active, start containerd
# Exits the script with the matching ERR_* code when a required step fails.
ensureContainerd() {
    if [ "${TELEPORT_ENABLED}" == "true" ]; then
        ensureTeleportd
    fi

    mkdir -p "/etc/systemd/system/containerd.service.d"
    tee "/etc/systemd/system/containerd.service.d/exec_start.conf" > /dev/null <<EOF
[Service]
ExecStartPost=/sbin/iptables -P FORWARD ACCEPT
EOF

    if [ "${ARTIFACT_STREAMING_ENABLED}" == "true" ]; then
        logs_to_events "AKS.CSE.ensureContainerd.ensureArtifactStreaming" ensureArtifactStreaming || exit $ERR_ARTIFACT_STREAMING_INSTALL
    fi

    mkdir -p /etc/containerd
    if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" == "true" ]]; then
        echo "Generating non-GPU containerd config for GPU node due to VM tags"
        echo "${CONTAINERD_CONFIG_NO_GPU_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT
    else
        echo "Generating containerd config..."
        echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT
    fi

    tee "/etc/sysctl.d/99-force-bridge-forward.conf" > /dev/null <<EOF
net.ipv4.ip_forward = 1
net.ipv4.conf.all.forwarding = 1
net.ipv6.conf.all.forwarding = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
    retrycmd_if_failure 120 5 25 sysctl --system || exit $ERR_SYSCTL_RELOAD

    # The exit must run in the current shell: inside a ( ... ) subshell it
    # would only leave the subshell and provisioning would carry on.
    if systemctl is-active --quiet docker; then
        systemctl_disable 20 30 120 docker || exit $ERR_SYSTEMD_DOCKER_STOP_FAIL
    fi
    systemctlEnableAndStart containerd || exit $ERR_SYSTEMCTL_START_FAIL
}
# Enable and start the ensure-no-dup unit; exit the script if that fails.
# Globals: ERR_SYSTEMCTL_START_FAIL (read)
ensureNoDupOnPromiscuBridge() {
    local unit="ensure-no-dup"
    systemctlEnableAndStart "${unit}" || exit $ERR_SYSTEMCTL_START_FAIL
}
# Enable and start the teleportd unit; exit the script if that fails.
# Globals: ERR_SYSTEMCTL_START_FAIL (read)
ensureTeleportd() {
    local unit="teleportd"
    systemctlEnableAndStart "${unit}" || exit $ERR_SYSTEMCTL_START_FAIL
}
# Bring up the artifact-streaming stack: the acr-mirror service, the overlaybd
# install/auth scripts, the target_core_user module, the mirror configuration
# request, and finally the overlaybd and acr-nodemon services.
ensureArtifactStreaming() {
    local action unit_file service

    for action in enable start; do
        systemctl "${action}" acr-mirror.service
    done

    sudo /opt/acr/tools/overlaybd/install.sh
    sudo /opt/acr/tools/overlaybd/enable-http-auth.sh
    modprobe target_core_user
    curl -X PUT 'localhost:8578/config?ns=_default&enable_suffix=azurecr.io&stream_format=overlaybd' -O

    # These units are enabled by the path of their unit file.
    for unit_file in \
        /opt/overlaybd/overlaybd-tcmu.service \
        /opt/overlaybd/snapshotter/overlaybd-snapshotter.service; do
        systemctl enable "${unit_file}"
    done

    for service in overlaybd-tcmu overlaybd-snapshotter acr-nodemon; do
        systemctl start "${service}"
    done
}
# Start docker once its daemon.json is present and parses as JSON.
# Adds the admin user to the docker group, polls up to 1200 times (one second
# apart) for a non-empty, valid /etc/docker/daemon.json, stops containerd if it
# is active, then enables and starts docker.
# Globals: ADMINUSER, ERR_FILE_WATCH_TIMEOUT,
#          ERR_SYSTEMD_CONTAINERD_STOP_FAIL, ERR_DOCKER_START_FAIL (read);
#          DOCKER_SERVICE_EXEC_START_FILE, DOCKER_MOUNT_FLAGS_SYSTEMD_FILE,
#          DOCKER_JSON_FILE (written)
ensureDocker() {
    DOCKER_SERVICE_EXEC_START_FILE=/etc/systemd/system/docker.service.d/exec_start.conf
    usermod -aG docker "${ADMINUSER}"
    DOCKER_MOUNT_FLAGS_SYSTEMD_FILE=/etc/systemd/system/docker.service.d/clear_mount_propagation_flags.conf
    DOCKER_JSON_FILE=/etc/docker/daemon.json

    for (( i = 1; i <= 1200; i++ )); do
        if [ -s "${DOCKER_JSON_FILE}" ]; then
            # Stop polling as soon as the file is valid JSON.
            jq '.' < "${DOCKER_JSON_FILE}" && break
        fi
        if [ "${i}" -eq 1200 ]; then
            exit $ERR_FILE_WATCH_TIMEOUT
        else
            sleep 1
        fi
    done

    # The exit must run in the current shell: inside a ( ... ) subshell it
    # would only leave the subshell and provisioning would carry on.
    if systemctl is-active --quiet containerd; then
        systemctl_disable 20 30 120 containerd || exit $ERR_SYSTEMD_CONTAINERD_STOP_FAIL
    fi
    systemctlEnableAndStart docker || exit $ERR_DOCKER_START_FAIL
}
# Start the dhcpv6 unit and load the ip6_tables kernel module (with retries).
# Globals: ERR_SYSTEMCTL_START_FAIL, ERR_MODPROBE_FAIL (read)
ensureDHCPv6() {
    local unit="dhcpv6"
    local module="ip6_tables"

    systemctlEnableAndStart "${unit}" || exit $ERR_SYSTEMCTL_START_FAIL
    retrycmd_if_failure 120 5 25 modprobe "${module}" || exit $ERR_MODPROBE_FAIL
}
# Prepare everything kubelet needs and start it:
#   - /etc/default/kubelet with flags, image, labels and network policy
#   - the cluster CA certificate
#   - either a bootstrap kubeconfig (TLS bootstrapping) or a client-cert kubeconfig
#   - the kubelet.sh runtime script
# Globals: reads the variables referenced below; writes KUBELET_DEFAULT_FILE,
#   KUBE_CA_FILE, KUBELET_TLS_DROP_IN, BOOTSTRAP_KUBECONFIG_FILE,
#   KUBECONFIG_FILE, KUBELET_RUNTIME_CONFIG_SCRIPT_FILE.
ensureKubelet() {
    KUBELET_DEFAULT_FILE=/etc/default/kubelet
    mkdir -p /etc/default

    # From k8s 1.29 on, kubelet no longer sets the node internalIP when an
    # external cloud provider is used:
    # https://github.com/kubernetes/kubernetes/pull/121028
    # Azure CNI Overlay and Podsubnet clusters need that internal IP before pod
    # networking can be configured, so node startup gets slower. To avoid this,
    # pass `--node-ip` explicitly using the IP returned from IMDS.
    # Failure is tolerated: cloud-node-manager assigns the IP later anyway.
    if semverCompare ${KUBERNETES_VERSION:-"0.0.0"} "1.29.0"; then
        logs_to_events "AKS.CSE.ensureKubelet.setKubeletNodeIPFlag" setKubeletNodeIPFlag
    fi

    # The whole defaults file is (re)written in one go.
    {
        echo "KUBELET_FLAGS=${KUBELET_FLAGS}"
        echo "KUBELET_REGISTER_SCHEDULABLE=true"
        echo "NETWORK_POLICY=${NETWORK_POLICY}"
        echo "KUBELET_IMAGE=${KUBELET_IMAGE}"
        echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}"
        if [[ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]]; then
            echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}"
        fi
    } > "${KUBELET_DEFAULT_FILE}"

    KUBE_CA_FILE="/etc/kubernetes/certs/ca.crt"
    mkdir -p "${KUBE_CA_FILE%/*}"
    echo "${KUBE_CA_CRT}" | base64 -d > "${KUBE_CA_FILE}"
    chmod 0600 "${KUBE_CA_FILE}"

    if [[ "${ENABLE_TLS_BOOTSTRAPPING}" == "true" ]]; then
        # TLS bootstrapping: kubelet starts from a token-based bootstrap kubeconfig.
        KUBELET_TLS_DROP_IN="/etc/systemd/system/kubelet.service.d/10-tlsbootstrap.conf"
        mkdir -p "${KUBELET_TLS_DROP_IN%/*}"
        touch "${KUBELET_TLS_DROP_IN}"
        chmod 0600 "${KUBELET_TLS_DROP_IN}"
        tee "${KUBELET_TLS_DROP_IN}" > /dev/null <<EOF
[Service]
Environment="KUBELET_TLS_BOOTSTRAP_FLAGS=--kubeconfig /var/lib/kubelet/kubeconfig --bootstrap-kubeconfig /var/lib/kubelet/bootstrap-kubeconfig"
EOF

        BOOTSTRAP_KUBECONFIG_FILE=/var/lib/kubelet/bootstrap-kubeconfig
        mkdir -p "${BOOTSTRAP_KUBECONFIG_FILE%/*}"
        touch "${BOOTSTRAP_KUBECONFIG_FILE}"
        chmod 0644 "${BOOTSTRAP_KUBECONFIG_FILE}"
        tee "${BOOTSTRAP_KUBECONFIG_FILE}" > /dev/null <<EOF
apiVersion: v1
kind: Config
clusters:
- name: localcluster
cluster:
certificate-authority: /etc/kubernetes/certs/ca.crt
server: https://${API_SERVER_NAME}:443
users:
- name: kubelet-bootstrap
user:
token: "${TLS_BOOTSTRAP_TOKEN}"
contexts:
- context:
cluster: localcluster
user: kubelet-bootstrap
name: bootstrap-context
current-context: bootstrap-context
EOF
    else
        # No TLS bootstrapping: kubelet authenticates with the client cert/key.
        KUBECONFIG_FILE=/var/lib/kubelet/kubeconfig
        mkdir -p "${KUBECONFIG_FILE%/*}"
        touch "${KUBECONFIG_FILE}"
        chmod 0644 "${KUBECONFIG_FILE}"
        tee "${KUBECONFIG_FILE}" > /dev/null <<EOF
apiVersion: v1
kind: Config
clusters:
- name: localcluster
cluster:
certificate-authority: /etc/kubernetes/certs/ca.crt
server: https://${API_SERVER_NAME}:443
users:
- name: client
user:
client-certificate: /etc/kubernetes/certs/client.crt
client-key: /etc/kubernetes/certs/client.key
contexts:
- context:
cluster: localcluster
user: client
name: localclustercontext
current-context: localclustercontext
EOF
    fi

    KUBELET_RUNTIME_CONFIG_SCRIPT_FILE=/opt/azure/containers/kubelet.sh
    tee "${KUBELET_RUNTIME_CONFIG_SCRIPT_FILE}" > /dev/null <<EOF
#!/bin/bash
# Disallow container from reaching out to the special IP address 168.63.129.16
# for TCP protocol (which http uses)
#
# 168.63.129.16 contains protected settings that have priviledged info.
#
# The host can still reach 168.63.129.16 because it goes through the OUTPUT chain, not FORWARD.
#
# Note: we should not block all traffic to 168.63.129.16. For example UDP traffic is still needed
# for DNS.
iptables -I FORWARD -d 168.63.129.16 -p tcp --dport 80 -j DROP
EOF

    systemctlEnableAndStart kubelet || exit $ERR_KUBELET_START_FAIL
}
# Pass the GPU instance profile to the mig-partition unit through a drop-in
# and try to start the unit.
# Globals: GPU_INSTANCE_PROFILE (read)
ensureMigPartition() {
    local dropin_dir="/etc/systemd/system/mig-partition.service.d"
    local dropin_file="${dropin_dir}/10-mig-profile.conf"

    mkdir -p "${dropin_dir}/"
    touch "${dropin_file}"
    tee "${dropin_file}" > /dev/null <<EOF
[Service]
Environment="GPU_INSTANCE_PROFILE=${GPU_INSTANCE_PROFILE}"
EOF

    # The start is expected to fail now and to work only after the next
    # reboot. It MAY appear to succeed because a Type=simple systemd service
    # does not report a non-zero status when ExecStart could not be invoked.
    # The result is therefore deliberately not checked.
    systemctlEnableAndStart mig-partition
}
# Decode SYSCTL_CONTENT into the AKS sysctl config file and reload all sysctl
# settings (with retries).
# Globals: SYSCTL_CONTENT (read); SYSCTL_CONFIG_FILE (written)
ensureSysctl() {
    SYSCTL_CONFIG_FILE=/etc/sysctl.d/999-sysctl-aks.conf

    mkdir -p "${SYSCTL_CONFIG_FILE%/*}"
    touch "${SYSCTL_CONFIG_FILE}"
    chmod 0644 "${SYSCTL_CONFIG_FILE}"
    base64 -d <<< "${SYSCTL_CONTENT}" > "${SYSCTL_CONFIG_FILE}"

    retrycmd_if_failure 24 5 25 sysctl --system
}
# Wait (with retries) until `cluster-info` succeeds against the control plane.
# The check is skipped when a reboot is pending or outbound access is disabled.
# Globals: REBOOTREQUIRED (run as a command), NO_OUTBOUND, KUBECTL,
#          ERR_K8S_RUNNING_TIMEOUT (read)
ensureK8sControlPlane() {
    if ! $REBOOTREQUIRED && [ "$NO_OUTBOUND" != "true" ]; then
        # KUBECTL is left unquoted on purpose so it may carry extra arguments.
        retrycmd_if_failure 120 5 25 $KUBECTL cluster-info 2>/dev/null || exit $ERR_K8S_RUNNING_TIMEOUT
    fi
}
# Create the Kubernetes manifest directory.
# Globals: KUBEMANIFESTDIR (written)
createKubeManifestDir() {
    KUBEMANIFESTDIR=/etc/kubernetes/manifests
    mkdir -p "${KUBEMANIFESTDIR}"
}
# Write an admin kubeconfig into the admin user's home directory. The
# directory and file are created, handed to the admin user and restricted to
# owner-only access before the credentials are written.
# Shell tracing is disabled while the credentials are expanded.
# Globals: ADMINUSER, CA_CERTIFICATE, KUBECONFIG_SERVER, MASTER_FQDN,
#          KUBECONFIG_CERTIFICATE, KUBECONFIG_KEY (read);
#          KUBECONFIGDIR, KUBECONFIGFILE (written)
writeKubeConfig() {
    KUBECONFIGDIR=/home/$ADMINUSER/.kube
    KUBECONFIGFILE=$KUBECONFIGDIR/config

    mkdir -p $KUBECONFIGDIR
    touch $KUBECONFIGFILE
    chown $ADMINUSER:$ADMINUSER $KUBECONFIGDIR
    chown $ADMINUSER:$ADMINUSER $KUBECONFIGFILE
    chmod 700 $KUBECONFIGDIR
    chmod 600 $KUBECONFIGFILE

    set +x
    # The leading and trailing blank lines are part of the file content.
    cat > $KUBECONFIGFILE <<EOF

---
apiVersion: v1
clusters:
- cluster:
certificate-authority-data: "$CA_CERTIFICATE"
server: $KUBECONFIG_SERVER
name: "$MASTER_FQDN"
contexts:
- context:
cluster: "$MASTER_FQDN"
user: "$MASTER_FQDN-admin"
name: "$MASTER_FQDN"
current-context: "$MASTER_FQDN"
kind: Config
users:
- name: "$MASTER_FQDN-admin"
user:
client-certificate-data: "$KUBECONFIG_CERTIFICATE"
client-key-data: "$KUBECONFIG_KEY"

EOF
    set -x
}
# Fill the placeholders of the cluster-autoscaler addon manifest with the
# base64-encoded service principal and subscription details.
# Each value is encoded together with a trailing newline (as `echo` produced
# it before) and emitted on a single line: without `-w 0`, base64 wraps long
# values at 76 columns, which would put a newline into the sed expression.
# Globals: SERVICE_PRINCIPAL_CLIENT_ID, SERVICE_PRINCIPAL_CLIENT_SECRET,
#          SUBSCRIPTION_ID, TENANT_ID, RESOURCE_GROUP (read);
#          CLUSTER_AUTOSCALER_ADDON_FILE (written)
configClusterAutoscalerAddon() {
    CLUSTER_AUTOSCALER_ADDON_FILE=/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml
    sed -i "s|<clientID>|$(printf '%s\n' "${SERVICE_PRINCIPAL_CLIENT_ID}" | base64 -w 0)|g" "${CLUSTER_AUTOSCALER_ADDON_FILE}"
    sed -i "s|<clientSec>|$(printf '%s\n' "${SERVICE_PRINCIPAL_CLIENT_SECRET}" | base64 -w 0)|g" "${CLUSTER_AUTOSCALER_ADDON_FILE}"
    sed -i "s|<subID>|$(printf '%s\n' "${SUBSCRIPTION_ID}" | base64 -w 0)|g" "${CLUSTER_AUTOSCALER_ADDON_FILE}"
    sed -i "s|<tenantID>|$(printf '%s\n' "${TENANT_ID}" | base64 -w 0)|g" "${CLUSTER_AUTOSCALER_ADDON_FILE}"
    sed -i "s|<rg>|$(printf '%s\n' "${RESOURCE_GROUP}" | base64 -w 0)|g" "${CLUSTER_AUTOSCALER_ADDON_FILE}"
}
# Prepares the ACI connector addon manifest: builds the base64-encoded Azure
# credentials JSON, generates a self-signed key/certificate pair, and
# substitutes all four placeholders in the manifest.
# Globals read: SERVICE_PRINCIPAL_CLIENT_ID, SERVICE_PRINCIPAL_CLIENT_SECRET,
#               TENANT_ID, SUBSCRIPTION_ID, RESOURCE_GROUP
# Globals written: ACI_CONNECTOR_CREDENTIALS, ACI_CONNECTOR_KEY,
#                  ACI_CONNECTOR_CERT, ACI_CONNECTOR_ADDON_FILE
configACIConnectorAddon() {
    local creds_format='{"clientId": "%s", "clientSecret": "%s", "tenantId": "%s", "subscriptionId": "%s", "activeDirectoryEndpointUrl": "https://login.microsoftonline.com","resourceManagerEndpointUrl": "https://management.azure.com/", "activeDirectoryGraphResourceId": "https://graph.windows.net/", "sqlManagementEndpointUrl": "https://management.core.windows.net:8443/", "galleryEndpointUrl": "https://gallery.azure.com/", "managementEndpointUrl": "https://management.core.windows.net/"}'
    local key_file=/etc/kubernetes/certs/aci-connector-key.pem
    local cert_file=/etc/kubernetes/certs/aci-connector-cert.pem

    ACI_CONNECTOR_CREDENTIALS=$(printf "${creds_format}" "$SERVICE_PRINCIPAL_CLIENT_ID" "$SERVICE_PRINCIPAL_CLIENT_SECRET" "$TENANT_ID" "$SUBSCRIPTION_ID" | base64 -w 0)

    # Self-signed certificate for the virtual kubelet, valid for ten years.
    openssl req -newkey rsa:4096 -new -nodes -x509 -days 3650 -keyout ${key_file} -out ${cert_file} -subj "/C=US/ST=CA/L=virtualkubelet/O=virtualkubelet/OU=virtualkubelet/CN=virtualkubelet"
    ACI_CONNECTOR_KEY=$(base64 ${key_file} -w0)
    ACI_CONNECTOR_CERT=$(base64 ${cert_file} -w0)

    ACI_CONNECTOR_ADDON_FILE=/etc/kubernetes/addons/aci-connector-deployment.yaml
    sed -i "s|<creds>|$ACI_CONNECTOR_CREDENTIALS|g" $ACI_CONNECTOR_ADDON_FILE
    sed -i "s|<rgName>|$RESOURCE_GROUP|g" $ACI_CONNECTOR_ADDON_FILE
    sed -i "s|<cert>|$ACI_CONNECTOR_CERT|g" $ACI_CONNECTOR_ADDON_FILE
    sed -i "s|<key>|$ACI_CONNECTOR_KEY|g" $ACI_CONNECTOR_ADDON_FILE
}
# Replaces the <resourceId> placeholder in the Azure Policy addon manifest
# with the resource-group scope built from SUBSCRIPTION_ID and RESOURCE_GROUP.
# Globals written: AZURE_POLICY_ADDON_FILE
configAzurePolicyAddon() {
    AZURE_POLICY_ADDON_FILE=/etc/kubernetes/addons/azure-policy-deployment.yaml
    local resource_id="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP"
    sed -i "s|<resourceId>|${resource_id}|g" $AZURE_POLICY_ADDON_FILE
}
# Installs and activates the NVIDIA GPU driver.
# On Ubuntu the driver is installed by running the aks-gpu image through
# containerd (ctr) or docker; on Mariner/Azure Linux dedicated helper
# functions are used. Any other OS exits with status 1. Afterwards the driver
# is probed (nvidia-modprobe, nvidia-smi, ldconfig) and the container runtime
# daemon is sent SIGHUP so it reloads.
# Globals read: OS, UBUNTU_OS_NAME, CONTAINER_RUNTIME, NVIDIA_DRIVER_IMAGE,
#               NVIDIA_DRIVER_IMAGE_TAG, CTR_GPU_INSTALL_CMD,
#               DOCKER_GPU_INSTALL_CMD
# Globals written: ret
configGPUDrivers() {
    # install gpu driver
    if [[ $OS == $UBUNTU_OS_NAME ]]; then
        local driver_image="$NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG"
        mkdir -p /opt/actions /opt/gpu
        if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then
            ctr image pull $driver_image
            retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $driver_image gpuinstall /entrypoint.sh install"
            ret=$?
            if [[ "$ret" != "0" ]]; then
                echo "Failed to install GPU driver, exiting..."
                exit $ERR_GPU_DRIVERS_START_FAIL
            fi
            # The installer image is no longer needed once the driver is in place.
            ctr images rm --sync $driver_image
        else
            bash -c "$DOCKER_GPU_INSTALL_CMD $driver_image install"
            ret=$?
            if [[ "$ret" != "0" ]]; then
                echo "Failed to install GPU driver, exiting..."
                exit $ERR_GPU_DRIVERS_START_FAIL
            fi
            docker rmi $driver_image
        fi
    elif isMarinerOrAzureLinux "$OS"; then
        downloadGPUDrivers
        installNvidiaContainerToolkit
        enableNvidiaPersistenceMode
    else
        echo "os $OS not supported at this time. skipping configGPUDrivers"
        exit 1
    fi

    # Verify that the driver loads and responds.
    retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
    retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
    retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL

    # Fix the NVIDIA /dev/char link issue
    if isMarinerOrAzureLinux "$OS"; then
        createNvidiaSymlinkToAllDeviceNodes
    fi

    # reload containerd/dockerd
    local runtime_daemon=dockerd
    if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then
        runtime_daemon=containerd
    fi
    retrycmd_if_failure 120 5 25 pkill -SIGHUP ${runtime_daemon} || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT
}
# Checks that the NVIDIA driver is loaded and that nvidia-smi works.
# If nvidia-modprobe cannot load the driver, configGPUDrivers is run as a
# fallback. Exits with ERR_GPU_INFO_ROM_CORRUPTED when nvidia-smi reports a
# corrupted infoROM, or ERR_GPU_DRIVERS_START_FAIL for any other failure.
# Does nothing on ARM64.
# Globals written: SMI_RESULT, SMI_STATUS
validateGPUDrivers() {
    # no GPU on ARM64
    [[ $(isARM64) == 1 ]] && return

    if ! { retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded"; }; then
        configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
    fi

    # Prefer nvidia-smi from PATH; otherwise use the copy under GPU_DEST.
    if which nvidia-smi; then
        SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
    else
        SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
    fi
    SMI_STATUS=$?

    if [[ $SMI_STATUS == 0 ]]; then
        echo "gpu driver working fine"
        return
    fi
    if [[ $SMI_RESULT == *"infoROM is corrupted"* ]]; then
        exit $ERR_GPU_INFO_ROM_CORRUPTED
    fi
    exit $ERR_GPU_DRIVERS_START_FAIL
}
# Entry point for GPU driver setup. Depending on CONFIG_GPU_DRIVER_IF_NEEDED
# it either installs the drivers (configGPUDrivers) or only validates them
# (validateGPUDrivers), reporting the step through logs_to_events. On Ubuntu
# the nvidia-modprobe service is then enabled and started. No-op on ARM64.
ensureGPUDrivers() {
    # no GPU on ARM64
    [[ $(isARM64) == 1 ]] && return

    local gpu_step=validateGPUDrivers
    if [[ "${CONFIG_GPU_DRIVER_IF_NEEDED}" = true ]]; then
        gpu_step=configGPUDrivers
    fi
    logs_to_events "AKS.CSE.ensureGPUDrivers.${gpu_step}" ${gpu_step}

    if [[ $OS == $UBUNTU_OS_NAME ]]; then
        logs_to_events "AKS.CSE.ensureGPUDrivers.nvidia-modprobe" "systemctlEnableAndStart nvidia-modprobe" || exit $ERR_GPU_DRIVERS_START_FAIL
    fi
}
# Stops and disables the ssh service; exits with ERR_DISABLE_SSH on failure.
disableSSH() {
    if ! systemctlDisableAndStop ssh; then
        exit $ERR_DISABLE_SSH
    fi
}
# Queries the instance metadata service for the first network interface's
# primary private IPv4/IPv6 addresses and, if any were found, appends
# "--node-ip=<addr>[,<addr>]" to KUBELET_FLAGS.
# Nothing is changed when the metadata request fails.
# Globals written: imdsOutput, nodeIPAddrs, ipv4Addr, ipv6Addr, nodeIPArg,
#                  KUBELET_FLAGS
setKubeletNodeIPFlag() {
    if ! imdsOutput=$(curl -s -H Metadata:true --noproxy "*" --max-time 5 "http://169.254.169.254/metadata/instance/network/interface?api-version=2021-02-01" 2> /dev/null); then
        return 0
    fi

    nodeIPAddrs=()
    ipv4Addr=$(echo $imdsOutput | jq -r '.[0].ipv4.ipAddress[0].privateIpAddress // ""')
    if [ -n "$ipv4Addr" ]; then
        nodeIPAddrs+=("$ipv4Addr")
    fi
    ipv6Addr=$(echo $imdsOutput | jq -r '.[0].ipv6.ipAddress[0].privateIpAddress // ""')
    if [ -n "$ipv6Addr" ]; then
        nodeIPAddrs+=("$ipv6Addr")
    fi

    nodeIPArg=$(IFS=, ; echo "${nodeIPAddrs[*]}") # join, comma-separated
    if [ -n "$nodeIPArg" ]; then
        echo "Adding --node-ip=$nodeIPArg to kubelet flags"
        KUBELET_FLAGS="$KUBELET_FLAGS --node-ip=$nodeIPArg"
    fi
}
#EOF

Просмотреть файл

@ -1,400 +0,0 @@
#!/bin/bash
# Shared constants for the provisioning scripts: exit codes first, then
# OS detection, tool paths, and GPU driver settings.
#
# Exit codes. Each ERR_* value is used as a process exit status elsewhere in
# these scripts; commented-out entries are kept to show codes that are retired.
# ERR_SYSTEMCTL_ENABLE_FAIL=3 Service could not be enabled by systemctl -- DEPRECATED
ERR_SYSTEMCTL_START_FAIL=4 # Service could not be started or enabled by systemctl
ERR_CLOUD_INIT_TIMEOUT=5 # Timeout waiting for cloud-init runcmd to complete
ERR_FILE_WATCH_TIMEOUT=6 # Timeout waiting for a file
ERR_HOLD_WALINUXAGENT=7 # Unable to place walinuxagent apt package on hold during install
ERR_RELEASE_HOLD_WALINUXAGENT=8 # Unable to release hold on walinuxagent apt package after install
ERR_APT_INSTALL_TIMEOUT=9 # Timeout installing required apt packages
ERR_DOCKER_INSTALL_TIMEOUT=20 # Timeout waiting for docker install
ERR_DOCKER_DOWNLOAD_TIMEOUT=21 # Timeout waiting for docker downloads
ERR_DOCKER_KEY_DOWNLOAD_TIMEOUT=22 # Timeout waiting to download docker repo key
ERR_DOCKER_APT_KEY_TIMEOUT=23 # Timeout waiting for docker apt-key
ERR_DOCKER_START_FAIL=24 # Docker could not be started by systemctl
ERR_MOBY_APT_LIST_TIMEOUT=25 # Timeout waiting for moby apt sources
ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT=26 # Timeout waiting for MS GPG key download
ERR_MOBY_INSTALL_TIMEOUT=27 # Timeout waiting for moby-docker install
ERR_CONTAINERD_INSTALL_TIMEOUT=28 # Timeout waiting for moby-containerd install
ERR_RUNC_INSTALL_TIMEOUT=29 # Timeout waiting for moby-runc install
ERR_K8S_RUNNING_TIMEOUT=30 # Timeout waiting for k8s cluster to be healthy
ERR_K8S_DOWNLOAD_TIMEOUT=31 # Timeout waiting for Kubernetes downloads
ERR_KUBECTL_NOT_FOUND=32 # kubectl client binary not found on local disk
ERR_IMG_DOWNLOAD_TIMEOUT=33 # Timeout waiting for img download
ERR_KUBELET_START_FAIL=34 # kubelet could not be started by systemctl
ERR_DOCKER_IMG_PULL_TIMEOUT=35 # Timeout trying to pull a Docker image
ERR_CONTAINERD_CTR_IMG_PULL_TIMEOUT=36 # Timeout trying to pull a containerd image via cli tool ctr
ERR_CONTAINERD_CRICTL_IMG_PULL_TIMEOUT=37 # Timeout trying to pull a containerd image via cli tool crictl
ERR_CONTAINERD_INSTALL_FILE_NOT_FOUND=38 # Unable to locate containerd debian pkg file
ERR_CNI_DOWNLOAD_TIMEOUT=41 # Timeout waiting for CNI downloads
ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT=42 # Timeout waiting for https://packages.microsoft.com/config/ubuntu/16.04/packages-microsoft-prod.deb
ERR_MS_PROD_DEB_PKG_ADD_FAIL=43 # Failed to add repo pkg file
# ERR_FLEXVOLUME_DOWNLOAD_TIMEOUT=44 Failed to add repo pkg file -- DEPRECATED
ERR_ORAS_DOWNLOAD_ERROR=45 # Unable to install oras
ERR_SYSTEMD_INSTALL_FAIL=48 # Unable to install required systemd version
ERR_MODPROBE_FAIL=49 # Unable to load a kernel module using modprobe
ERR_OUTBOUND_CONN_FAIL=50 # Unable to establish outbound connection
ERR_K8S_API_SERVER_CONN_FAIL=51 # Unable to establish connection to k8s api server
ERR_K8S_API_SERVER_DNS_LOOKUP_FAIL=52 # Unable to resolve k8s api server name
ERR_K8S_API_SERVER_AZURE_DNS_LOOKUP_FAIL=53 # Unable to resolve k8s api server name due to Azure DNS issue
ERR_KATA_KEY_DOWNLOAD_TIMEOUT=60 # Timeout waiting to download kata repo key
ERR_KATA_APT_KEY_TIMEOUT=61 # Timeout waiting for kata apt-key
ERR_KATA_INSTALL_TIMEOUT=62 # Timeout waiting for kata install
ERR_VHD_FILE_NOT_FOUND=65 # VHD log file not found on VM built from VHD distro (previously classified as exit code 124)
ERR_CONTAINERD_DOWNLOAD_TIMEOUT=70 # Timeout waiting for containerd downloads
ERR_RUNC_DOWNLOAD_TIMEOUT=71 # Timeout waiting for runc downloads
ERR_CUSTOM_SEARCH_DOMAINS_FAIL=80 # Unable to configure custom search domains
ERR_GPU_DOWNLOAD_TIMEOUT=83 # Timeout waiting for GPU driver download
ERR_GPU_DRIVERS_START_FAIL=84 # nvidia-modprobe could not be started by systemctl
ERR_GPU_DRIVERS_INSTALL_TIMEOUT=85 # Timeout waiting for GPU drivers install
ERR_GPU_DEVICE_PLUGIN_START_FAIL=86 # nvidia device plugin could not be started by systemctl
ERR_GPU_INFO_ROM_CORRUPTED=87 # info ROM corrupted error when executing nvidia-smi
ERR_SGX_DRIVERS_INSTALL_TIMEOUT=90 # Timeout waiting for SGX prereqs to download
ERR_SGX_DRIVERS_START_FAIL=91 # Failed to execute SGX driver binary
ERR_APT_DAILY_TIMEOUT=98 # Timeout waiting for apt daily updates
ERR_APT_UPDATE_TIMEOUT=99 # Timeout waiting for apt-get update to complete
ERR_CSE_PROVISION_SCRIPT_NOT_READY_TIMEOUT=100 # Timeout waiting for cloud-init to place this script on the vm
ERR_APT_DIST_UPGRADE_TIMEOUT=101 # Timeout waiting for apt-get dist-upgrade to complete
ERR_APT_PURGE_FAIL=102 # Error purging distro packages
ERR_SYSCTL_RELOAD=103 # Error reloading sysctl config
ERR_CIS_ASSIGN_ROOT_PW=111 # Error assigning root password in CIS enforcement
ERR_CIS_ASSIGN_FILE_PERMISSION=112 # Error assigning permission to a file in CIS enforcement
ERR_PACKER_COPY_FILE=113 # Error writing a file to disk during VHD CI
ERR_CIS_APPLY_PASSWORD_CONFIG=115 # Error applying CIS-recommended passwd configuration
ERR_SYSTEMD_DOCKER_STOP_FAIL=116 # Error stopping dockerd
ERR_CRICTL_DOWNLOAD_TIMEOUT=117 # Timeout waiting for crictl downloads
ERR_CRICTL_OPERATION_ERROR=118 # Error executing a crictl operation
ERR_CTR_OPERATION_ERROR=119 # Error executing a ctr containerd cli operation
# Azure Stack specific errors
ERR_AZURE_STACK_GET_ARM_TOKEN=120 # Error generating a token to use with Azure Resource Manager
ERR_AZURE_STACK_GET_NETWORK_CONFIGURATION=121 # Error fetching the network configuration for the node
ERR_AZURE_STACK_GET_SUBNET_PREFIX=122 # Error fetching the subnet address prefix for a subnet ID
# Error code 124 is returned when a `timeout` command times out, and --preserve-status is not specified: https://man7.org/linux/man-pages/man1/timeout.1.html
ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions
ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file
ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation
ERR_TELEPORTD_DOWNLOAD_ERR=150 # Error downloading teleportd binary
ERR_TELEPORTD_INSTALL_ERR=151 # Error installing teleportd binary
ERR_ARTIFACT_STREAMING_DOWNLOAD=152 # Error downloading mirror proxy and overlaybd components
ERR_ARTIFACT_STREAMING_INSTALL=153 # Error installing mirror proxy and overlaybd components
ERR_HTTP_PROXY_CA_CONVERT=160 # Error converting http proxy ca cert from pem to crt format
ERR_UPDATE_CA_CERTS=161 # Error updating ca certs to include user-provided certificates
# NOTE: "DISBALE" is misspelled in the variable name below; the name is left
# unchanged because other scripts may reference it.
ERR_DISBALE_IPTABLES=170 # Error disabling iptables service
ERR_KRUSTLET_DOWNLOAD_TIMEOUT=171 # Timeout waiting for krustlet downloads
ERR_DISABLE_SSH=172 # Error disabling ssh service
ERR_VHD_REBOOT_REQUIRED=200 # Reserved for VHD reboot required exit condition
ERR_NO_PACKAGES_FOUND=201 # Reserved for no security packages found exit condition
ERR_SYSTEMCTL_MASK_FAIL=2 # Service could not be masked by systemctl
# Upper-cased distro identifier, taken from the first matching ID= (or
# ID_LIKE=coreos) line of the reverse-sorted /etc/*-release files.
OS=$(sort -r /etc/*-release | gawk 'match($0, /^(ID_LIKE=(coreos)|ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }')
# Upper-cased VERSION_ID from the same files, with double quotes removed.
OS_VERSION=$(sort -r /etc/*-release | gawk 'match($0, /^(VERSION_ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }' | tr -d '"')
UBUNTU_OS_NAME="UBUNTU"
MARINER_OS_NAME="MARINER"
AZURELINUX_OS_NAME="AZURELINUX"
KUBECTL=/usr/local/bin/kubectl
DOCKER=/usr/bin/docker
# this will be empty during VHD build
# but vhd build runs with `set -o nounset`
# so needs a default value
# prefer empty string to avoid potential "it works but did something weird" scenarios
export GPU_DV="${GPU_DRIVER_VERSION:=}"
export GPU_DEST=/usr/local/nvidia
NVIDIA_DOCKER_VERSION=2.8.0-1
DOCKER_VERSION=1.13.1-1
NVIDIA_CONTAINER_RUNTIME_VERSION="3.6.0"
# The GPU installer image reference is "<image>:<driver version>-<sha>".
export NVIDIA_DRIVER_IMAGE_SHA="sha-e8873b"
export NVIDIA_DRIVER_IMAGE_TAG="${GPU_DV}-${NVIDIA_DRIVER_IMAGE_SHA}"
export NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu"
# Command prefixes used to run the GPU installer image with ctr or docker;
# the image reference and entrypoint arguments are appended by the caller.
export CTR_GPU_INSTALL_CMD="ctr run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind"
export DOCKER_GPU_INSTALL_CMD="docker run --privileged --net=host --pid=host -v /opt/gpu:/mnt/gpu -v /opt/actions:/mnt/actions --rm"
APT_CACHE_DIR=/var/cache/apt/archives/
PERMANENT_CACHE_DIR=/root/aptcache/
# Directory where logs_to_events writes its JSON event files.
EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/
# Scratch file that captures verbose curl output for later display on failure.
CURL_OUTPUT=/tmp/curl_verbose.out
# Runs a command under `timeout`, retrying until it succeeds.
# Arguments: $1 - maximum number of attempts
#            $2 - seconds to sleep between attempts
#            $3 - per-attempt timeout passed to `timeout`
#            $4.. - command and its arguments
# Outputs:   a line reporting how many attempts were made
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, i
retrycmd_if_failure() {
    retries=$1; wait_sleep=$2; timeout=$3
    shift && shift && shift
    for i in $(seq 1 $retries); do
        if timeout $timeout "${@}"; then
            break
        fi
        if [ $i -eq $retries ]; then
            echo Executed \"$@\" $i times;
            return 1
        fi
        sleep $wait_sleep
    done
    echo Executed \"$@\" $i times;
}
# Same retry loop as retrycmd_if_failure, but without the "Executed ..."
# summary line.
# Arguments: $1 - maximum number of attempts
#            $2 - seconds to sleep between attempts
#            $3 - per-attempt timeout passed to `timeout`
#            $4.. - command and its arguments
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, i
retrycmd_if_failure_no_stats() {
    retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift
    for i in $(seq 1 $retries); do
        # "${@}" is quoted so that arguments containing whitespace or glob
        # characters reach the command unchanged (as in retrycmd_if_failure).
        timeout $timeout "${@}" && break || \
        if [ $i -eq $retries ]; then
            return 1
        else
            sleep $wait_sleep
        fi
    done
}
# Downloads a gzip tarball with curl until `tar -tzf` can list it.
# Arguments: $1 - maximum number of attempts
#            $2 - seconds to sleep between attempts
#            $3 - destination path of the tarball
#            $4 - URL to download from
# Outputs:   the retry count, the tarball listing, and curl's verbose output
#            when a download attempt fails
# Returns:   0 once the tarball is readable, 1 when attempts are exhausted
# Globals written: tar_retries, wait_sleep, tarball, url, i
retrycmd_get_tarball() {
    tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4
    echo "${tar_retries} retries"
    for i in $(seq 1 $tar_retries); do
        # A tarball that lists cleanly is considered already downloaded.
        if tar -tzf $tarball; then
            break
        fi
        if [ $i -eq $tar_retries ]; then
            return 1
        fi
        if ! timeout 60 curl -fsSLv $url -o $tarball > $CURL_OUTPUT 2>&1; then
            cat $CURL_OUTPUT
        fi
        sleep $wait_sleep
    done
}
# Pulls a gzip tarball from an OCI registry with `oras pull` until
# `tar -tzf` can list it.
# Arguments: $1 - maximum number of attempts
#            $2 - seconds to sleep between attempts
#            $3 - expected path of the tarball (oras writes into its directory)
#            $4 - registry reference to pull
# Returns:   0 once the tarball is readable, 1 when attempts are exhausted
# Globals read: ORAS_REGISTRY_CONFIG_FILE, ORAS_OUTPUT
# Globals written: tar_retries, wait_sleep, tarball, url, tar_folder, i
retrycmd_get_tarball_from_registry_with_oras() {
    tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4
    tar_folder=$(dirname "$tarball")
    echo "${tar_retries} retries"
    for i in $(seq 1 $tar_retries); do
        if tar -tzf $tarball; then
            break
        fi
        if [ $i -eq $tar_retries ]; then
            return 1
        fi
        # TODO: support private acr via kubelet identity
        if ! timeout 60 oras pull $url -o $tar_folder --registry-config ${ORAS_REGISTRY_CONFIG_FILE} > $ORAS_OUTPUT 2>&1; then
            cat $ORAS_OUTPUT
        fi
        sleep $wait_sleep
    done
}
# Pulls a single binary from an OCI registry with `oras pull` until the
# expected file exists.
# Arguments: $1 - maximum number of attempts
#            $2 - seconds to sleep between attempts
#            $3 - expected path of the binary (oras writes into its directory)
#            $4 - registry reference to pull
# Returns:   0 once the file exists, 1 when attempts are exhausted
# Globals read: ORAS_REGISTRY_CONFIG_FILE, ORAS_OUTPUT
# Globals written: binary_retries, wait_sleep, binary_path, url,
#                  binary_folder, i
retrycmd_get_binary_from_registry_with_oras() {
    binary_retries=$1; wait_sleep=$2; binary_path=$3; url=$4
    binary_folder=$(dirname "$binary_path")
    echo "${binary_retries} retries"
    for i in $(seq 1 $binary_retries); do
        if [ -f "$binary_path" ]; then
            break
        fi
        if [ $i -eq $binary_retries ]; then
            return 1
        fi
        # TODO: support private acr via kubelet identity
        if ! timeout 60 oras pull $url -o $binary_folder --registry-config ${ORAS_REGISTRY_CONFIG_FILE} > $ORAS_OUTPUT 2>&1; then
            cat $ORAS_OUTPUT
        fi
        sleep $wait_sleep
    done
}
# Downloads a file with curl, retrying until the destination file exists.
# Arguments: $1 - maximum number of loop iterations (the last one only checks
#                 for the file; it does not download)
#            $2 - seconds to sleep between attempts
#            $3 - per-attempt timeout passed to `timeout`
#            $4 - destination file path
#            $5 - URL to download from
# Outputs:   the retry count, plus curl's verbose output for failed attempts
# Returns:   0 once the file exists, 1 when attempts are exhausted
# Globals read: CURL_OUTPUT
# Globals written: curl_retries, wait_sleep, timeout, filepath, url, i
retrycmd_curl_file() {
    curl_retries=$1; wait_sleep=$2; timeout=$3; filepath=$4; url=$5
    echo "${curl_retries} retries"
    for i in $(seq 1 $curl_retries); do
        [[ -f $filepath ]] && break
        if [ $i -eq $curl_retries ]; then
            return 1
        else
            # Redirect straight into the log file instead of piping through
            # tee: with a pipeline, $? reports tee's status, so curl failures
            # were never detected and their output never shown.
            timeout $timeout curl -fsSLv $url -o $filepath > $CURL_OUTPUT 2>&1
            if [[ $? != 0 ]]; then
                cat $CURL_OUTPUT
            fi
            sleep $wait_sleep
        fi
    done
}
# Waits until a file contains the '#EOF' marker, then strips the marker lines
# and records the file path in the "paved" list so later calls return at once.
# Arguments: $1 - maximum number of checks
#            $2 - seconds to sleep between checks
#            $3 - path of the file to wait for
# Returns:   0 if the file was already recorded; 1 if the marker never
#            appeared; otherwise the status of appending to the paved list
# Globals written: retries, wait_sleep, filepath, paved, i
wait_for_file() {
    retries=$1; wait_sleep=$2; filepath=$3
    paved=/opt/azure/cloud-init-files.paved
    if grep -Fq "${filepath}" $paved; then
        return 0
    fi
    for i in $(seq 1 $retries); do
        if grep -Fq '#EOF' $filepath; then
            break
        fi
        if [ $i -eq $retries ]; then
            return 1
        fi
        sleep $wait_sleep
    done
    sed -i "/#EOF/d" $filepath
    echo $filepath >> $paved
}
# Restarts a systemd unit, retrying on failure. Every attempt is preceded by
# `systemctl daemon-reload`; after a failed attempt (except the last) the
# unit's status and journal are printed.
# Arguments: $1 - maximum attempts, $2 - sleep seconds between attempts,
#            $3 - per-command timeout, $4 - unit name
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, svcname, i
systemctl_restart() {
    retries=$1; wait_sleep=$2; timeout=$3 svcname=$4
    for i in $(seq 1 $retries); do
        timeout $timeout systemctl daemon-reload
        if timeout $timeout systemctl restart $svcname; then
            break
        fi
        if [ $i -eq $retries ]; then
            return 1
        fi
        systemctl status $svcname --no-pager -l
        journalctl -u $svcname
        sleep $wait_sleep
    done
}
# Stops a systemd unit, retrying on failure. Every attempt is preceded by
# `systemctl daemon-reload`.
# Arguments: $1 - maximum attempts, $2 - sleep seconds between attempts,
#            $3 - per-command timeout, $4 - unit name
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, svcname, i
systemctl_stop() {
    retries=$1; wait_sleep=$2; timeout=$3 svcname=$4
    for i in $(seq 1 $retries); do
        timeout $timeout systemctl daemon-reload
        if timeout $timeout systemctl stop $svcname; then
            break
        fi
        if [ $i -eq $retries ]; then
            return 1
        fi
        sleep $wait_sleep
    done
}
# Disables a systemd unit, retrying on failure. Every attempt is preceded by
# `systemctl daemon-reload`.
# Arguments: $1 - maximum attempts, $2 - sleep seconds between attempts,
#            $3 - per-command timeout, $4 - unit name
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, svcname, i
systemctl_disable() {
    retries=$1; wait_sleep=$2; timeout=$3 svcname=$4
    for i in $(seq 1 $retries); do
        timeout $timeout systemctl daemon-reload
        if timeout $timeout systemctl disable $svcname; then
            break
        fi
        if [ $i -eq $retries ]; then
            return 1
        fi
        sleep $wait_sleep
    done
}
# Reloads all sysctl configuration with `sysctl --system`, retrying on failure.
# Arguments: $1 - maximum attempts, $2 - sleep seconds between attempts,
#            $3 - per-attempt timeout
# Returns:   0 on success, 1 when every attempt failed
# Globals written: retries, wait_sleep, timeout, i
sysctl_reload() {
    retries=$1; wait_sleep=$2; timeout=$3
    for i in $(seq 1 $retries); do
        if timeout $timeout sysctl --system; then
            break
        fi
        if [ $i -eq $retries ]; then
            return 1
        fi
        sleep $wait_sleep
    done
}
# Succeeds when $1 is the highest of all given versions under `sort -V`
# ordering, i.e. when $1 >= every other argument.
version_gte() {
    local newest
    newest="$(printf '%s\n' "$@" | sort -rV | head -n 1)"
    [ "${newest}" == "$1" ]
}
# Restarts a systemd unit and then enables it.
# The unit's status is always written to /var/log/azure/<unit>-status.log.
# Arguments: $1 - unit name
# Returns:   0 on success, 1 if the unit could not be started or enabled
# Globals written: RESTART_STATUS
systemctlEnableAndStart() {
    local unit=$1
    systemctl_restart 100 5 30 $unit
    RESTART_STATUS=$?
    # Capture the status for diagnostics whether or not the restart worked.
    systemctl status $unit --no-pager -l > /var/log/azure/$unit-status.log
    if [ $RESTART_STATUS -ne 0 ]; then
        echo "${unit} could not be started"
        return 1
    fi
    if retrycmd_if_failure 120 5 25 systemctl enable $unit; then
        return 0
    fi
    echo "${unit} could not be enabled by systemctl"
    return 1
}
# Stops and disables a systemd unit if `systemctl list-units` knows about it.
# Failures to stop or disable are reported but do not fail the function.
# Arguments: $1 - unit name (without the ".service" suffix)
systemctlDisableAndStop() {
    if ! systemctl list-units --full --all | grep -q "$1.service"; then
        return 0
    fi
    systemctl_stop 20 5 25 $1 || echo "$1 could not be stopped"
    systemctl_disable 20 5 25 $1 || echo "$1 could not be disabled"
}
# Returns 0 (true) if version $1 >= version $2, otherwise 1.
# Build metadata (everything from the first "+") is ignored on both sides and
# ordering follows `sort -V`.
# Globals written: VERSION_A, VERSION_B, sorted, highestVersion
semverCompare() {
    # Quote the inputs so they are neither word-split nor glob-expanded.
    VERSION_A=$(echo "$1" | cut -d "+" -f 1)
    VERSION_B=$(echo "$2" | cut -d "+" -f 1)
    [[ "${VERSION_A}" == "${VERSION_B}" ]] && return 0
    sorted=$(printf '%s\n' "${VERSION_A}" "${VERSION_B}" | sort -V)
    # The last line of the ascending sort is the higher version.
    highestVersion=$(printf '%s\n' "${sorted}" | tail -n 1)
    # Quote the right-hand side: unquoted, [[ == ]] treats it as a glob
    # pattern rather than a literal string.
    [[ "${VERSION_A}" == "${highestVersion}" ]] && return 0
    return 1
}
# Downloads a .deb package with `apt-get download` into the given directory.
# Arguments: $1 - package name
#            $2 - package version prefix (matched as "<version>*")
#            $3 - directory to download into (created if missing)
# Globals written: PKG_NAME, PKG_VERSION, PKG_DIRECTORY
downloadDebPkgToFile() {
    PKG_NAME=$1; PKG_VERSION=$2; PKG_DIRECTORY=$3
    mkdir -p ${PKG_DIRECTORY}
    # apt-get download writes into the current directory, hence the pushd.
    # shellcheck disable=SC2164
    pushd $PKG_DIRECTORY
    retrycmd_if_failure 10 5 600 apt-get download $PKG_NAME=$PKG_VERSION*
    # shellcheck disable=SC2164
    popd
}
# Downloads packages with `apt-get download` into APT_CACHE_DIR, retrying on
# failure. Before each attempt pending dpkg configuration is completed and
# apt locks are awaited.
# Arguments: $1 - maximum attempts
#            $2 - seconds to sleep between attempts
#            $3.. - package names
# Returns:   0 on success, 1 if the directory change fails or every attempt
#            fails
# Globals written: retries, wait_sleep, i
apt_get_download() {
    retries=$1; wait_sleep=$2
    shift && shift
    local ret=0
    pushd $APT_CACHE_DIR || return 1
    for i in $(seq 1 $retries); do
        dpkg --configure -a --force-confdef
        wait_for_apt_locks
        if apt-get -o Dpkg::Options::=--force-confold download -y "${@}"; then
            break
        fi
        if [ $i -eq $retries ]; then
            ret=1
        else
            sleep $wait_sleep
        fi
    done
    popd || return 1
    return $ret
}
# Prints "arm64" when `uname -m` reports aarch64/arm64 (case-insensitive),
# otherwise "amd64".
# Globals written: arch
getCPUArch() {
    arch=$(uname -m)
    case "${arch,,}" in
        aarch64 | arm64)
            echo "arm64"
            ;;
        *)
            echo "amd64"
            ;;
    esac
}
# Prints 1 when getCPUArch reports "arm64", otherwise 0.
isARM64() {
    local is_arm=0
    if [[ $(getCPUArch) == "arm64" ]]; then
        is_arm=1
    fi
    echo ${is_arm}
}
# Runs a command and records a JSON event describing it in
# EVENTS_LOGGING_DIR.
# Arguments: $1 - task name stored in the event's TaskName field
#            $2.. - command to run; it may be passed as separate words or as
#                   one string such as "fn arg1 arg2"
# Returns:   the exit status of the command
# Globals read: EVENTS_LOGGING_DIR
# Globals written: ret, json_string
logs_to_events() {
    # local vars here allow for nested function tracking
    # installContainerRuntime for example
    local task=$1; shift
    local eventsFileName=$(date +%s%3N)

    local startTime=$(date +"%F %T.%3N")
    # Intentionally unquoted: callers pass whole command lines as a single
    # string, which must be word-split before execution.
    ${@}
    ret=$?
    local endTime=$(date +"%F %T.%3N")

    # arg names are defined by GA and all these are required to be correctly read by GA
    # EventPid, EventTid are required to be int. No use case for them at this point.
    # Message uses ${*} so the whole command becomes ONE jq argument; with
    # ${@} a multi-word command expanded into several arguments and broke the
    # jq command line.
    json_string=$( jq -n \
        --arg Timestamp "${startTime}" \
        --arg OperationId "${endTime}" \
        --arg Version "1.23" \
        --arg TaskName "${task}" \
        --arg EventLevel "Informational" \
        --arg Message "Completed: ${*}" \
        --arg EventPid "0" \
        --arg EventTid "0" \
        '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}'
    )
    # Left unquoted so the JSON is written on a single line, as before.
    echo ${json_string} > ${EVENTS_LOGGING_DIR}${eventsFileName}.json

    # this allows an error from the command at ${@} to be returned and correct code assigned in cse_main
    if [ "$ret" != "0" ]; then
        return $ret
    fi
}
# Prints "true" or "false" depending on whether the VM carries a tag whose
# name matches "SkipGpuDriverInstall" (case-insensitive) with a value matching
# "true" (case-insensitive), according to the instance metadata service.
# Returns: curl's exit status when the metadata request fails.
# Globals written: body, ret, should_skip
# Side effect: enables `set -x` tracing.
should_skip_nvidia_drivers() {
    set -x
    body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01")
    ret=$?
    [ "$ret" != "0" ] && return $ret
    should_skip=$(echo "$body" | jq -e '.compute.tagsList | map(select(.name | test("SkipGpuDriverInstall"; "i")))[0].value // "false" | test("true"; "i")')
    echo "$should_skip" # true or false
}
#HELPERSEOF

Просмотреть файл

@ -1,531 +0,0 @@
#!/bin/bash
# Paths, release detection, and version lists shared by the install functions
# below.
CC_SERVICE_IN_TMP=/opt/azure/containers/cc-proxy.service.in
CC_SOCKET_IN_TMP=/opt/azure/containers/cc-proxy.socket.in
# CNI configuration, binary, and download staging directories.
CNI_CONFIG_DIR="/etc/cni/net.d"
CNI_BIN_DIR="/opt/cni/bin"
CNI_DOWNLOADS_DIR="/opt/cni/downloads"
CRICTL_DOWNLOAD_DIR="/opt/crictl/downloads"
CRICTL_BIN_DIR="/usr/local/bin"
CONTAINERD_DOWNLOADS_DIR="/opt/containerd/downloads"
RUNC_DOWNLOADS_DIR="/opt/runc/downloads"
K8S_DOWNLOADS_DIR="/opt/kubernetes/downloads"
# Release number reported by lsb_release; functions below treat an empty
# value as "not Ubuntu".
UBUNTU_RELEASE=$(lsb_release -r -s)
# Upper-cased distro identifier, taken from the first matching ID= (or
# ID_LIKE=coreos) line of the reverse-sorted /etc/*-release files.
OS=$(sort -r /etc/*-release | gawk 'match($0, /^(ID_LIKE=(coreos)|ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }')
TELEPORTD_PLUGIN_DOWNLOAD_DIR="/opt/teleportd/downloads"
TELEPORTD_PLUGIN_BIN_DIR="/usr/local/bin"
# Space-separated version lists.
CONTAINERD_WASM_VERSIONS="v0.3.0 v0.5.1 v0.8.0"
SPIN_KUBE_VERSIONS="v0.15.1"
MANIFEST_FILEPATH="/opt/azure/manifest.json"
MAN_DB_AUTO_UPDATE_FLAG_FILEPATH="/var/lib/man-db/auto-update"
# Scratch file that captures verbose curl output for later display on failure.
CURL_OUTPUT=/tmp/curl_verbose.out
# Deletes the man-db auto-update flag file (no error if it is absent).
# Globals read: MAN_DB_AUTO_UPDATE_FLAG_FILEPATH
removeManDbAutoUpdateFlagFile() {
    rm -f ${MAN_DB_AUTO_UPDATE_FLAG_FILEPATH}
}
# Creates (or touches) the man-db auto-update flag file.
# Globals read: MAN_DB_AUTO_UPDATE_FLAG_FILEPATH
createManDbAutoUpdateFlagFile() {
    touch ${MAN_DB_AUTO_UPDATE_FLAG_FILEPATH}
}
# Removes the containerd download staging directory and everything in it.
# Globals read: CONTAINERD_DOWNLOADS_DIR
cleanupContainerdDlFiles() {
    rm -rf ${CONTAINERD_DOWNLOADS_DIR}
}
# Installs the container runtime: standalone containerd when
# NEEDS_CONTAINERD is "true", otherwise moby (installMoby).
# The containerd version ("<patch version>-<revision>") is read from the
# manifest file; Ubuntu 18.04 uses the pinned "1804" entry instead of "edge".
# Exits with ERR_CONTAINERD_INSTALL_TIMEOUT when no usable version is found.
# Globals read: NEEDS_CONTAINERD, KUBERNETES_VERSION, MANIFEST_FILEPATH,
#               UBUNTU_RELEASE
# Globals written: containerd_patch_version, containerd_revision
installContainerRuntime() {
    if [ "${NEEDS_CONTAINERD}" != "true" ]; then
        installMoby
        return
    fi

    echo "in installContainerRuntime - KUBERNETES_VERSION = ${KUBERNETES_VERSION}"
    local containerd_version
    if [ ! -f "$MANIFEST_FILEPATH" ]; then
        echo "WARNING: containerd version not found in manifest, defaulting to hardcoded."
    else
        containerd_version="$(jq -r .containerd.edge "$MANIFEST_FILEPATH")"
        if [ "${UBUNTU_RELEASE}" == "18.04" ]; then
            containerd_version="$(jq -r '.containerd.pinned."1804"' "$MANIFEST_FILEPATH")"
        fi
    fi

    # Split "<patch version>-<revision>" on the dash.
    containerd_patch_version="$(echo "$containerd_version" | cut -d- -f1)"
    containerd_revision="$(echo "$containerd_version" | cut -d- -f2)"
    if [[ -z "$containerd_patch_version" || "$containerd_patch_version" == "null" || "$containerd_revision" == "null" ]]; then
        echo "invalid container version: $containerd_version"
        exit $ERR_CONTAINERD_INSTALL_TIMEOUT
    fi

    logs_to_events "AKS.CSE.installContainerRuntime.installStandaloneContainerd" "installStandaloneContainerd ${containerd_patch_version} ${containerd_revision}"
    echo "in installContainerRuntime - CONTAINERD_VERION = ${containerd_patch_version}"
}
# Installs the CNI plugins: Azure CNI first when NETWORK_PLUGIN is "azure",
# then always the reference plugins. The CNI download directory is removed in
# the background afterwards.
installNetworkPlugin() {
    case "${NETWORK_PLUGIN}" in
        azure)
            installAzureCNI
            ;;
    esac
    installCNI #reference plugins. Mostly for kubenet but loop back used by contaierd until containerd 2
    rm -rf $CNI_DOWNLOADS_DIR &
}
# Checks that every requested wasm shim binary is present on disk.
# File names have the form
#   containerd-shim-<shim>-<version with dots as dashes>-<suffix>
# Arguments: $1 - directory holding the shims
#            $2 - shim version (e.g. v0.3.0)
#            $3 - version suffix appended after a dash
#            $4.. - shim names
# Returns:   0 when all files exist (and prints a summary), 1 otherwise
wasmFilesExist() {
    local containerd_wasm_filepath=${1}
    local shim_version=${2}
    local version_suffix=${3}
    local shims_to_download=("${@:4}") # every argument from the fourth onwards
    local binary_version=${shim_version//./-}
    local shim shim_path
    for shim in "${shims_to_download[@]}"; do
        shim_path="${containerd_wasm_filepath}/containerd-shim-${shim}-${binary_version}-${version_suffix}"
        [ -f "${shim_path}" ] || return 1 # file is missing
    done
    echo "all wasm files exist for ${containerd_wasm_filepath}/containerd-shim-*-${binary_version}-${version_suffix}"
    return 0
}
# Downloading and the permission update must happen within this one function
# call so that the WASMSHIMPIDS array filled by downloadContainerdWasmShims
# is still available for the `wait` below.
# Arguments: $1 - directory to download the shims into
#            $2 - package download URL template (resolved per version by
#                 evalPackageDownloadURL)
#            $3.. - shim versions without the leading "v"
# Globals written: PACKAGE_DOWNLOAD_URL, containerd_wasm_url, version
installContainerdWasmShims(){
    local download_location=${1}
    PACKAGE_DOWNLOAD_URL=${2}
    local package_versions=("${@:3}") # every argument from the third onwards
    local shims_to_download

    for version in "${package_versions[@]}"; do
        shims_to_download=("spin" "slight")
        # Only release 0.8.0 additionally ships the wws shim.
        [[ "$version" == "0.8.0" ]] && shims_to_download+=("wws")
        containerd_wasm_url=$(evalPackageDownloadURL ${PACKAGE_DOWNLOAD_URL})
        downloadContainerdWasmShims $download_location $containerd_wasm_url "v$version" "${shims_to_download[@]}" # adding v to version for simplicity
    done

    # wait for file downloads to complete before updating file permissions
    wait ${WASMSHIMPIDS[@]}

    for version in "${package_versions[@]}"; do
        shims_to_download=("spin" "slight")
        [[ "$version" == "0.8.0" ]] && shims_to_download+=("wws")
        updateContainerdWasmShimsPermissions $download_location "v$version" "${shims_to_download[@]}"
    done
}
# Downloads the containerd wasm shims for one version unless they are already
# present. With BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER set, the shims are
# pulled as a tarball through oras; otherwise each shim is fetched with curl
# in the background and its PID is appended to WASMSHIMPIDS.
# Arguments: $1 - directory to download into
#            $2 - base URL for curl downloads
#            $3 - shim version (e.g. v0.3.0)
#            $4.. - shim names
# Globals read: CPU_ARCH, CURL_OUTPUT, BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER
# Globals written: WASMSHIMPIDS, BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER
downloadContainerdWasmShims() {
    local containerd_wasm_filepath=${1}
    local containerd_wasm_url=${2}
    local shim_version=${3}
    local shims_to_download=("${@:4}") # Capture all arguments starting from the fourth indx

    local binary_version="$(echo "${shim_version}" | tr . -)" # replaces . with - == 1.2.3 -> 1-2-3

    # wasmFilesExist inserts the "-" before the suffix itself, so pass "v1".
    # Passing "-v1" made it look for "...--v1", which never matched and
    # forced a re-download on every run.
    if wasmFilesExist "$containerd_wasm_filepath" "$shim_version" "v1" "${shims_to_download[@]}"; then
        echo "containerd-wasm-shims already exists in $containerd_wasm_filepath, will not be downloading."
        return
    fi

    # Oras download for WASM for Network Isolated Clusters
    BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER:=}"
    if [[ ! -z ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then
        local registry_url="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER}/oss/binaries/deislabs/containerd-wasm-shims:${shim_version}-linux-${CPU_ARCH}"
        local wasm_shims_tgz_tmp=$containerd_wasm_filepath/containerd-wasm-shims-linux-${CPU_ARCH}.tar.gz

        # NOTE(review): ERR_ORAS_PULL_CONTAINERD_WASM is not defined in the
        # part of the file visible here - confirm it is set elsewhere.
        retrycmd_get_tarball_from_registry_with_oras 120 5 "${wasm_shims_tgz_tmp}" ${registry_url} || exit $ERR_ORAS_PULL_CONTAINERD_WASM
        tar -zxf "$wasm_shims_tgz_tmp" -C $containerd_wasm_filepath
        # Rename each extracted shim from the dotted version to the dashed
        # one. A glob inside double quotes is never expanded, so the previous
        # single `mv "...-*-..."` could not rename anything.
        # Assumes the tarball names its files
        # containerd-shim-<shim>-<shim_version>-v1 - TODO confirm.
        local shim extracted_shim
        for shim in "${shims_to_download[@]}"; do
            extracted_shim="$containerd_wasm_filepath/containerd-shim-${shim}-${shim_version}-v1"
            if [ -f "$extracted_shim" ]; then
                mv "$extracted_shim" "$containerd_wasm_filepath/containerd-shim-${shim}-${binary_version}-v1"
            fi
        done
        rm -f "$wasm_shims_tgz_tmp"
        return
    fi

    # NOTE(review): tee's stdout goes to /dev/null, so the grep below never
    # receives input and the error branch cannot trigger; all background
    # downloads also share one $CURL_OUTPUT file. Left unchanged here.
    for shim in "${shims_to_download[@]}"; do
        retrycmd_if_failure 30 5 60 curl -fSLv -o "$containerd_wasm_filepath/containerd-shim-${shim}-${binary_version}-v1" "$containerd_wasm_url/containerd-shim-${shim}-v1" 2>&1 | tee $CURL_OUTPUT >/dev/null | grep -E "^(curl:.*)|([eE]rr.*)$" && (cat $CURL_OUTPUT && exit $ERR_KRUSTLET_DOWNLOAD_TIMEOUT) &
        WASMSHIMPIDS+=($!)
    done
}
# Marks the downloaded wasm shim binaries of one version as executable (755).
# Arguments: $1 - directory holding the shims
#            $2 - shim version (e.g. v0.3.0)
#            $3.. - shim names
updateContainerdWasmShimsPermissions() {
    local containerd_wasm_filepath=${1}
    local shim_version=${2}
    local shims_to_download=("${@:3}") # every argument from the third onwards
    local binary_version=${shim_version//./-}
    local shim
    for shim in "${shims_to_download[@]}"; do
        chmod 755 "${containerd_wasm_filepath}/containerd-shim-${shim}-${binary_version}-v1"
    done
}
# Downloads the SpinKube containerd shim for every requested version, waits
# for the background downloads recorded in SPINKUBEPIDS, then marks the shim
# binary as executable.
# Arguments: $1 - directory to download into
#            $2 - package download URL template (resolved per version by
#                 evalPackageDownloadURL)
#            $3.. - versions without the leading "v"
# Globals written: PACKAGE_DOWNLOAD_URL, containerd_spinkube_url, version
installSpinKube(){
    local download_location=${1}
    PACKAGE_DOWNLOAD_URL=${2}
    local package_versions=("${@:3}") # every argument from the third onwards

    for version in "${package_versions[@]}"; do
        containerd_spinkube_url=$(evalPackageDownloadURL ${PACKAGE_DOWNLOAD_URL})
        downloadSpinKube $download_location $containerd_spinkube_url "v$version" # adding v to version for simplicity
    done

    wait ${SPINKUBEPIDS[@]}

    # The binary name carries no version, so each pass touches the same file.
    for version in "${package_versions[@]}"; do
        chmod 755 "${download_location}/containerd-shim-spin-v2"
    done
}
# Downloads the containerd-shim-spin-v2 binary unless it is already present.
# With BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER set, the binary is pulled
# through oras; otherwise it is fetched with curl in the background and the
# PID is appended to SPINKUBEPIDS.
# Arguments: $1 - directory to download into
#            $2 - base URL for the curl download
#            $3 - shim version (e.g. v0.15.1), used in the registry tag
# Globals read: CPU_ARCH, CURL_OUTPUT, BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER
# Globals written: SPINKUBEPIDS, BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER
downloadSpinKube(){
    local containerd_spinkube_filepath=${1}
    local containerd_spinkube_url=${2}
    local shim_version=${3}

    if [ -f "$containerd_spinkube_filepath/containerd-shim-spin-v2" ]; then
        echo "containerd-shim-spin-v2 already exists in $containerd_spinkube_filepath, will not be downloading."
        return
    fi

    BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER:=}"
    if [[ ! -z ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then
        local registry_url="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER}/oss/binaries/spinkube/containerd-shim-spin:${shim_version}-linux-${CPU_ARCH}"
        local spin_shim_path="${containerd_spinkube_filepath}/containerd-shim-spin-v2"
        # NOTE(review): ERR_ORAS_PULL_CONTAINERD_WASM is not defined in the
        # part of the file visible here - confirm it is set elsewhere.
        retrycmd_get_binary_from_registry_with_oras 120 5 "${spin_shim_path}" "${registry_url}" || exit $ERR_ORAS_PULL_CONTAINERD_WASM
        # The pulled file IS the final binary, so it must be kept. The earlier
        # `rm -f` on this path deleted the shim right after downloading it.
        return
    fi

    # NOTE(review): tee's stdout goes to /dev/null, so the grep below never
    # receives input and the error branch cannot trigger. Left unchanged here.
    retrycmd_if_failure 30 5 60 curl -fSLv -o "$containerd_spinkube_filepath/containerd-shim-spin-v2" "$containerd_spinkube_url/containerd-shim-spin-v2" 2>&1 | tee $CURL_OUTPUT >/dev/null | grep -E "^(curl:.*)|([eE]rr.*)$" && (cat $CURL_OUTPUT && exit $ERR_KRUSTLET_DOWNLOAD_TIMEOUT) &
    SPINKUBEPIDS+=($!)
}
# Downloads the Azure CNI plugins tarball named by VNET_CNI_PLUGINS_URL into
# CNI_DOWNLOADS_DIR; exits with ERR_CNI_DOWNLOAD_TIMEOUT on failure.
# Globals written: CNI_TGZ_TMP
downloadAzureCNI() {
    mkdir -p ${CNI_DOWNLOADS_DIR}
    # File name = everything after the last "/" of the URL.
    CNI_TGZ_TMP=${VNET_CNI_PLUGINS_URL##*/}
    if ! retrycmd_get_tarball 120 5 "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" ${VNET_CNI_PLUGINS_URL}; then
        exit $ERR_CNI_DOWNLOAD_TIMEOUT
    fi
}
# Downloads the crictl release tarball for the given version and the host CPU
# architecture into CRICTL_DOWNLOAD_DIR.
# Arguments: $1 - crictl version without the leading "v"
# Returns:   the status of retrycmd_curl_file
# Globals written: CRICTL_VERSION, CPU_ARCH, CRICTL_DOWNLOAD_URL,
#                  CRICTL_TGZ_TEMP
downloadCrictl() {
    CRICTL_VERSION=$1
    CPU_ARCH=$(getCPUArch) #amd64 or arm64
    mkdir -p ${CRICTL_DOWNLOAD_DIR}
    local tarball_name="crictl-v${CRICTL_VERSION}-linux-${CPU_ARCH}.tar.gz"
    CRICTL_DOWNLOAD_URL="https://acs-mirror.azureedge.net/cri-tools/v${CRICTL_VERSION}/binaries/${tarball_name}"
    CRICTL_TGZ_TEMP=${CRICTL_DOWNLOAD_URL##*/}
    retrycmd_curl_file 10 5 60 "${CRICTL_DOWNLOAD_DIR}/${CRICTL_TGZ_TEMP}" ${CRICTL_DOWNLOAD_URL}
}
# Installs crictl from the pre-cached tarball unless a crictl binary is
# already on PATH.
# Returns: 1 when no pre-cached tarball is found (the download directory is
#          removed in that case).
# Globals read: CRICTL_VERSION, CRICTL_DOWNLOAD_DIR, CRICTL_BIN_DIR,
#               KUBERNETES_VERSION
# Globals written: CPU_ARCH, currentVersion, CRICTL_TGZ_TEMP
installCrictl() {
    CPU_ARCH=$(getCPUArch) #amd64 or arm64
    currentVersion=$(crictl --version 2>/dev/null | sed 's/crictl version //g')
    if [[ "${currentVersion}" != "" ]]; then
        echo "version ${currentVersion} of crictl already installed. skipping installCrictl of target version ${KUBERNETES_VERSION%.*}.0"
        return
    fi

    # this is only called during cse. VHDs should have crictl binaries pre-cached so no need to download.
    # if the vhd does not have crictl pre-baked, return early
    CRICTL_TGZ_TEMP="crictl-v${CRICTL_VERSION}-linux-${CPU_ARCH}.tar.gz"
    local cached_tarball="$CRICTL_DOWNLOAD_DIR/${CRICTL_TGZ_TEMP}"
    if [[ ! -f "${cached_tarball}" ]]; then
        rm -rf ${CRICTL_DOWNLOAD_DIR}
        echo "pre-cached crictl not found: skipping installCrictl"
        return 1
    fi

    echo "Unpacking crictl into ${CRICTL_BIN_DIR}"
    tar zxvf "${cached_tarball}" -C ${CRICTL_BIN_DIR}
    chown root:root $CRICTL_BIN_DIR/crictl
    chmod 755 $CRICTL_BIN_DIR/crictl
}
# Downloads the teleportd binary for the given version into
# TELEPORTD_PLUGIN_DOWNLOAD_DIR. Does nothing on ARM64.
# Arguments: $1 - base download URL
#            $2 - teleportd version without the leading "v"
# Exits with ERR_TELEPORTD_DOWNLOAD_ERR when an argument is missing or the
# download fails.
# Globals written: DOWNLOAD_URL, TELEPORTD_VERSION
downloadTeleportdPlugin() {
    DOWNLOAD_URL=$1
    TELEPORTD_VERSION=$2

    # no arm64 teleport binaries according to owner
    [[ $(isARM64) == 1 ]] && return

    if [[ -z ${DOWNLOAD_URL} ]]; then
        echo "download url parameter for downloadTeleportdPlugin was not given"
        exit $ERR_TELEPORTD_DOWNLOAD_ERR
    fi
    if [[ -z ${TELEPORTD_VERSION} ]]; then
        echo "teleportd version not given"
        exit $ERR_TELEPORTD_DOWNLOAD_ERR
    fi

    mkdir -p ${TELEPORTD_PLUGIN_DOWNLOAD_DIR}
    if ! retrycmd_curl_file 10 5 60 "${TELEPORTD_PLUGIN_DOWNLOAD_DIR}/teleportd-v${TELEPORTD_VERSION}" "${DOWNLOAD_URL}/v${TELEPORTD_VERSION}/teleportd"; then
        exit ${ERR_TELEPORTD_DOWNLOAD_ERR}
    fi
}
# Installs teleportd 0.8.0 unless the installed version is already at least
# that new, then removes the teleportd download directory. No-op on ARM64.
# Exits with ERR_TELEPORTD_INSTALL_ERR when moving or chmod-ing the binary
# fails.
# Globals read: TELEPORTD_PLUGIN_DOWNLOAD_URL, TELEPORTD_PLUGIN_DOWNLOAD_DIR,
#               TELEPORTD_PLUGIN_BIN_DIR, TELEPORTD_VERSION (set by
#               downloadTeleportdPlugin)
# Globals written: CURRENT_VERSION
installTeleportdPlugin() {
    # no arm64 teleport binaries according to owner
    [[ $(isARM64) == 1 ]] && return

    CURRENT_VERSION=$(teleportd --version 2>/dev/null | sed 's/teleportd version v//g')
    local TARGET_VERSION="0.8.0"
    local installed_binary="${TELEPORTD_PLUGIN_BIN_DIR}/teleportd"
    # A missing teleportd counts as version 0.0.0.
    if ! semverCompare ${CURRENT_VERSION:-"0.0.0"} ${TARGET_VERSION}; then
        downloadTeleportdPlugin ${TELEPORTD_PLUGIN_DOWNLOAD_URL} ${TARGET_VERSION}
        mv "${TELEPORTD_PLUGIN_DOWNLOAD_DIR}/teleportd-v${TELEPORTD_VERSION}" "${installed_binary}" || exit ${ERR_TELEPORTD_INSTALL_ERR}
        chmod 755 "${installed_binary}" || exit ${ERR_TELEPORTD_INSTALL_ERR}
    else
        echo "currently installed teleportd version ${CURRENT_VERSION} is greater than (or equal to) target base version ${TARGET_VERSION}. skipping installTeleportdPlugin."
    fi
    rm -rf ${TELEPORTD_PLUGIN_DOWNLOAD_DIR}
}
# Creates the CNI binary and configuration directories, owned by root.
# The binary directory is set to mode 755 recursively; for the configuration
# directory only the directory itself is set to 755.
# Globals read: CNI_BIN_DIR, CNI_CONFIG_DIR
setupCNIDirs() {
    # plugin binaries
    mkdir -p ${CNI_BIN_DIR}
    chown -R root:root ${CNI_BIN_DIR}
    chmod -R 755 ${CNI_BIN_DIR}

    # network configuration
    mkdir -p ${CNI_CONFIG_DIR}
    chown -R root:root ${CNI_CONFIG_DIR}
    chmod 755 ${CNI_CONFIG_DIR}
}
# Reference CNI plugins are used by kubenet, and the loopback plugin is used by containerd 1.0 (dependency gone in 2.0).
# The version used to be determined by RP/toggle but is now just hardcoded in the vhd, as it rarely changes and requires a node image upgrade anyway.
# Latest VHD should have the untar, older should have the tgz. And who knows will have neither.
#
# Moves the pre-extracted reference CNI plugins from CNI_DOWNLOADS_DIR into
# CNI_BIN_DIR. The plugin version comes from the "cni-plugins" entry of
# COMPONENTS_FILEPATH, resolved through updatePackageVersions.
# Globals read: COMPONENTS_FILEPATH, UBUNTU_OS_NAME, UBUNTU_RELEASE, OS,
#               CNI_DOWNLOADS_DIR, CNI_BIN_DIR
# Globals written: cniPackage, os, os_version, PACKAGE_VERSIONS,
#                  packageVersion, CNI_DIR_TMP
# NOTE(review): ERR_CNI_VERSION_INVALID and ERR_CONTAINERD_VERSION_INVALID are
# not defined in the part of the file visible here - confirm they are set
# elsewhere.
installCNI() {
    #always just use what is listed in components.json so we don't have to sync.
    cniPackage=$(jq ".Packages" "$COMPONENTS_FILEPATH" | jq ".[] | select(.name == \"cni-plugins\")") || exit $ERR_CNI_VERSION_INVALID

    #CNI doesn't really care about this but wanted to reuse updatePackageVersions which requires it.
    # Default to Ubuntu and its release; fall back to the detected OS with
    # version "current" when no Ubuntu release is known. The default must be
    # assigned BEFORE the fallback, otherwise "current" is overwritten by the
    # empty UBUNTU_RELEASE.
    os=${UBUNTU_OS_NAME}
    os_version="${UBUNTU_RELEASE}"
    if [[ -z "$UBUNTU_RELEASE" ]]; then
        os=${OS}
        os_version="current"
    fi

    PACKAGE_VERSIONS=()
    updatePackageVersions "${cniPackage}" "${os}" "${os_version}"

    #should change to ne
    # NOTE(review): the message mentions containerd and "installing the last
    # element" although this is the CNI package and the script exits; the
    # text and exit code are kept as they were.
    if [[ ${#PACKAGE_VERSIONS[@]} -gt 1 ]]; then
        echo "WARNING: containerd package versions array has more than one element. Installing the last element in the array."
        exit $ERR_CONTAINERD_VERSION_INVALID
    fi
    packageVersion=${PACKAGE_VERSIONS[0]}

    # Is there a ${arch} variable I can use instead of the iff
    if [[ $(isARM64) == 1 ]]; then
        CNI_DIR_TMP="cni-plugins-linux-arm64-v${packageVersion}"
    else
        CNI_DIR_TMP="cni-plugins-linux-amd64-v${packageVersion}"
    fi

    if [[ -d "$CNI_DOWNLOADS_DIR/${CNI_DIR_TMP}" ]]; then
        #not clear to me when this would ever happen. assume its related to the line above Latest VHD should have the untar, older should have the tgz.
        mv ${CNI_DOWNLOADS_DIR}/${CNI_DIR_TMP}/* $CNI_BIN_DIR
    else
        echo "CNI tarball should already be unzipped by components.json"
        exit $ERR_CNI_VERSION_INVALID
    fi

    chown -R root:root $CNI_BIN_DIR
}
# Places the Azure CNI plugins into CNI_BIN_DIR, preferring (in order) a pre-extracted
# directory, a cached tgz, and finally a fresh download.
# Globals read: VNET_CNI_PLUGINS_URL, CNI_DOWNLOADS_DIR, CNI_BIN_DIR
# Globals written: CNI_TGZ_TMP (file name of the tarball), CNI_DIR_TMP (same name without .tgz)
installAzureCNI() {
    # Strip everything up to the final "/" to get the tarball name, then drop ".tgz"
    # to get the name of the directory an extracted copy would live in.
    CNI_TGZ_TMP=${VNET_CNI_PLUGINS_URL##*/}
    CNI_DIR_TMP=${CNI_TGZ_TMP%.tgz}

    # Latest VHD should have the untarred directory, older ones the tgz; some have neither.
    if [[ ! -d "$CNI_DOWNLOADS_DIR/${CNI_DIR_TMP}" ]]; then
        # No extracted copy: make sure the tarball exists (download if needed), then untar it.
        [[ -f "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" ]] || logs_to_events "AKS.CSE.installAzureCNI.downloadAzureCNI" downloadAzureCNI
        tar -xzf "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" -C $CNI_BIN_DIR
    else
        mv ${CNI_DOWNLOADS_DIR}/${CNI_DIR_TMP}/* $CNI_BIN_DIR
    fi

    chown -R root:root $CNI_BIN_DIR
}
# Downloads a Kubernetes node tarball and extracts kubelet and kubectl into /usr/local/bin,
# suffixing each extracted file name with "-<version>".
# Arguments:
#   $1 - Kubernetes version used as the file-name suffix
#   $2 - URL of the tarball
# Globals written: K8S_VERSION, KUBE_BINARY_URL (overwritten with $2), K8S_TGZ_TMP
# Exits with ERR_K8S_DOWNLOAD_TIMEOUT when the tarball cannot be fetched.
extractKubeBinaries() {
    K8S_VERSION=$1
    KUBE_BINARY_URL=$2

    mkdir -p ${K8S_DOWNLOADS_DIR}

    # Tarball file name is whatever follows the last "/" of the URL.
    K8S_TGZ_TMP=${KUBE_BINARY_URL##*/}
    local tarball="$K8S_DOWNLOADS_DIR/${K8S_TGZ_TMP}"

    retrycmd_get_tarball 120 5 "${tarball}" ${KUBE_BINARY_URL} || exit $ERR_K8S_DOWNLOAD_TIMEOUT

    # --transform appends "-<version>" to every extracted name; --strip-components=3 drops
    # the leading kubernetes/node/bin/ path so the binaries land directly in /usr/local/bin.
    tar --transform="s|.*|&-${K8S_VERSION}|" --show-transformed-names -xzvf "${tarball}" \
        --strip-components=3 -C /usr/local/bin kubernetes/node/bin/kubelet kubernetes/node/bin/kubectl

    rm -f "${tarball}"
}
# Makes /usr/local/bin/kubelet and /usr/local/bin/kubectl point at the binaries for
# KUBERNETES_VERSION, downloading them first when a custom URL is set or when the
# versioned kubectl is not already on disk.
# Globals read: CUSTOM_KUBE_BINARY_URL, KUBERNETES_VERSION, KUBE_BINARY_URL
# Globals written: CUSTOM_KUBE_BINARY_URL (set to empty when unset), CUSTOM_KUBE_BINARY_DOWNLOAD_URL
installKubeletKubectlAndKubeProxy() {
    CUSTOM_KUBE_BINARY_DOWNLOAD_URL="${CUSTOM_KUBE_BINARY_URL:=}"

    if [[ -n ${CUSTOM_KUBE_BINARY_DOWNLOAD_URL} ]]; then
        # remove the kubelet binaries to make sure the only binary left is from the CUSTOM_KUBE_BINARY_DOWNLOAD_URL
        rm -rf /usr/local/bin/kubelet-* /usr/local/bin/kubectl-*

        # NOTE(mainred): we expect kubelet binary to be under `kubernetes/node/bin`. This suits the current setting of
        # kube binaries used by AKS and Kubernetes upstream.
        # TODO(mainred): let's see if necessary to auto-detect the path of kubelet
        logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy.extractKubeBinaries" extractKubeBinaries ${KUBERNETES_VERSION} ${CUSTOM_KUBE_BINARY_DOWNLOAD_URL}
    elif [[ ! -f "/usr/local/bin/kubectl-${KUBERNETES_VERSION}" ]]; then
        # Versioned kubectl is not cached: download when the minor version is >= 17 and a URL is known.
        #TODO: remove the condition check on KUBE_BINARY_URL once RP change is released
        if (($(echo ${KUBERNETES_VERSION} | cut -d"." -f2) >= 17)) && [ -n "${KUBE_BINARY_URL}" ]; then
            logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy.extractKubeBinaries" extractKubeBinaries ${KUBERNETES_VERSION} ${KUBE_BINARY_URL}
        fi
    fi

    # Promote the versioned binaries to their unversioned names.
    local binary
    for binary in kubelet kubectl; do
        mv "/usr/local/bin/${binary}-${KUBERNETES_VERSION}" "/usr/local/bin/${binary}"
    done
    chmod a+x /usr/local/bin/kubelet /usr/local/bin/kubectl

    # Remove every other cached version in the background.
    rm -rf /usr/local/bin/kubelet-* /usr/local/bin/kubectl-* /home/hyperkube-downloads &
}
# Pulls one container image with the requested CLI, retrying through retrycmd_if_failure.
# Arguments:
#   $1 - "ctr", "crictl", or anything else (treated as docker)
#   $2 - image reference to pull
# Globals written: CLI_TOOL, CONTAINER_IMAGE_URL
# On failure a message is printed and the function's status is the matching ERR_* code;
# the `exit` runs inside a ( ) subshell, so the calling shell itself keeps running.
pullContainerImage() {
    CLI_TOOL=$1
    CONTAINER_IMAGE_URL=$2
    echo "pulling the image ${CONTAINER_IMAGE_URL} using ${CLI_TOOL}"
    case "${CLI_TOOL}" in
        ctr)
            logs_to_events "AKS.CSE.imagepullctr.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 ctr --namespace k8s.io image pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via ctr" && exit $ERR_CONTAINERD_CTR_IMG_PULL_TIMEOUT)
            ;;
        crictl)
            logs_to_events "AKS.CSE.imagepullcrictl.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 crictl pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via crictl" && exit $ERR_CONTAINERD_CRICTL_IMG_PULL_TIMEOUT)
            ;;
        *)
            logs_to_events "AKS.CSE.imagepull.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 docker pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via docker" && exit $ERR_DOCKER_IMG_PULL_TIMEOUT)
            ;;
    esac
}
# Adds a second tag to an existing local image.
# Arguments:
#   $1 - "ctr", "crictl", or anything else (treated as docker)
#   $2 - existing image reference
#   $3 - new reference to create
# Globals written: CLI_TOOL, CONTAINER_IMAGE_URL, RETAG_IMAGE_URL
retagContainerImage() {
    CLI_TOOL=$1
    CONTAINER_IMAGE_URL=$2
    RETAG_IMAGE_URL=$3
    echo "retaging from ${CONTAINER_IMAGE_URL} to ${RETAG_IMAGE_URL} using ${CLI_TOOL}"
    case "${CLI_TOOL}" in
        ctr)
            ctr --namespace k8s.io image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL
            ;;
        crictl)
            crictl image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL
            ;;
        *)
            docker image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL
            ;;
    esac
}
# Gives every local mcr.microsoft.com image an additional mcr.azk8s.cn tag
# (the MCR endpoint used in mooncake).
# Globals read: CONTAINER_RUNTIME
# Globals written: allMCRImages, mcrImage, retagMCRImage
retagMCRImagesForChina() {
    # can't use CLI_TOOL because crictl doesn't support retagging, so the tool follows
    # the container runtime instead.
    local retagTool
    if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then
        retagTool="ctr"
        # shellcheck disable=SC2016
        allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}'))
    else
        retagTool="docker"
        # shellcheck disable=SC2016
        allMCRImages=($(docker images | grep '^mcr.microsoft.com/' | awk '{str = sprintf("%s:%s", $1, $2)} {print str}'))
    fi

    # Expanding the array name alone yields its first element, so this is an "is it empty" check.
    if [[ -z "${allMCRImages}" ]]; then
        echo "failed to find mcr images for retag"
        return
    fi

    for mcrImage in ${allMCRImages[@]+"${allMCRImages[@]}"}; do
        # Swap the registry host at the start of the reference.
        # shellcheck disable=SC2001
        retagMCRImage=$(echo ${mcrImage} | sed -e 's/^mcr.microsoft.com/mcr.azk8s.cn/g')
        retagContainerImage "${retagTool}" ${mcrImage} ${retagMCRImage}
    done
}
# Removes one image from the local image store.
# Arguments:
#   $1 - "docker" to use docker; every other value uses crictl
#   $2 - image reference to remove
# Globals written: CLI_TOOL, CONTAINER_IMAGE_URL
# NOTE(review): a "ctr" argument also lands in the crictl branch - confirm this is what
# callers that pass "ctr" expect.
removeContainerImage() {
    CLI_TOOL=$1
    CONTAINER_IMAGE_URL=$2
    case "${CLI_TOOL}" in
        docker)
            docker image rm $CONTAINER_IMAGE_URL
            ;;
        *)
            # crictl should always be present
            crictl rmi $CONTAINER_IMAGE_URL
            ;;
    esac
}
# Removes local images whose name matches the given pattern and whose tag does not
# belong to KUBERNETES_VERSION.
# Arguments:
#   $1 - grep pattern selecting the images to consider (e.g. "kube-proxy")
# Globals read (inside the child shell): NEEDS_CONTAINERD, CLI_TOOL, KUBERNETES_VERSION
# The actual work lives in the nested cleanupImagesRun function, which is exported and
# executed in a child bash through retrycmd_if_failure 10 5 120.
cleanUpImages() {
local targetImage=$1
# Exported so the child bash started below can read it.
export targetImage
function cleanupImagesRun() {
# List "name:tag" references, drop those ending in / containing the current
# KUBERNETES_VERSION (exact, with a numeric suffix, or followed by "-" or "_"),
# then keep only the ones matching targetImage.
if [ "${NEEDS_CONTAINERD}" == "true" ]; then
if [[ "${CLI_TOOL}" == "crictl" ]]; then
images_to_delete=$(crictl images | awk '{print $1":"$2}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n')
else
images_to_delete=$(ctr --namespace k8s.io images list | awk '{print $1}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n')
fi
else
images_to_delete=$(docker images --format '{{OpenBraces}}.Repository{{CloseBraces}}:{{OpenBraces}}.Tag{{CloseBraces}}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n')
fi
# Status of the if-block above, i.e. of whichever listing assignment ran.
# NOTE(review): unless pipefail is active in the child shell this is the status of the
# final `tr` stage only, so a failing listing command would go unnoticed - confirm.
local exit_code=$?
if [[ $exit_code != 0 ]]; then
exit $exit_code
elif [[ "${images_to_delete}" != "" ]]; then
# One image reference per line.
echo "${images_to_delete}" | while read image; do
if [ "${NEEDS_CONTAINERD}" == "true" ]; then
removeContainerImage ${CLI_TOOL} ${image}
else
removeContainerImage "docker" ${image}
fi
done
fi
}
# The child bash only sees exported functions.
export -f cleanupImagesRun
retrycmd_if_failure 10 5 120 bash -c cleanupImagesRun
}
# Runs cleanUpImages for the kube-proxy images, printing a
# "<date>,<hostname>, <marker>" line before and after.
cleanUpKubeProxyImages() {
    local -r component="kube-proxy"

    echo $(date),$(hostname), "startCleanUpKubeProxyImages"
    cleanUpImages "${component}"
    echo $(date),$(hostname), "endCleanUpKubeProxyImages"
}
# Removes the mcr.azk8s.cn-tagged images on every cloud except AzureChinaCloud.
# Globals read: TARGET_CLOUD, NEEDS_CONTAINERD, CLI_TOOL
# Globals written: images_to_delete
cleanupRetaggedImages() {
    if [[ "${TARGET_CLOUD}" == "AzureChinaCloud" ]]; then
        echo "skipping container cleanup for AzureChinaCloud"
        return
    fi

    # Collect the retagged references and decide which tool name is passed to removeContainerImage.
    local removalTool
    if [ "${NEEDS_CONTAINERD}" != "true" ]; then
        removalTool="docker"
        images_to_delete=$(docker images --format '{{OpenBraces}}.Repository{{CloseBraces}}:{{OpenBraces}}.Tag{{CloseBraces}}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n')
    else
        # always use ctr, even if crictl is installed.
        # crictl will remove *ALL* references to a given imageID (SHA), which removes too much.
        removalTool="ctr"
        if [[ "${CLI_TOOL}" == "crictl" ]]; then
            images_to_delete=$(crictl images | awk '{print $1":"$2}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n')
        else
            images_to_delete=$(ctr --namespace k8s.io images list | awk '{print $1}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n')
        fi
    fi

    if [[ -n "${images_to_delete}" ]]; then
        # One reference per line; the loop runs in the pipeline's subshell.
        echo "${images_to_delete}" | while read image; do
            removeContainerImage "${removalTool}" ${image}
        done
    fi
}
# Starts the kube-proxy image cleanup in a background child bash.
# The variables and functions that child needs are exported first, since a child
# shell only inherits exported names.
cleanUpContainerImages() {
    export KUBERNETES_VERSION CLI_TOOL
    export -f retrycmd_if_failure removeContainerImage cleanUpImages cleanUpKubeProxyImages

    # Not waited for: the caller continues while the cleanup runs.
    bash -c cleanUpKubeProxyImages &
}
# Deletes the containerd downloads directory (CONTAINERD_DOWNLOADS_DIR) and everything in it.
cleanUpContainerd() {
    rm -rf ${CONTAINERD_DOWNLOADS_DIR}
}
# Appends a datasource stanza that sets apply_network_config to false to the
# cloud-init drop-in /etc/cloud/cloud.cfg.d/80_azure_net_config.cfg.
# Globals written: CONFIG_FILEPATH
# The heredoc appends (>>), so calling this more than once adds the stanza again.
overrideNetworkConfig() {
CONFIG_FILEPATH="/etc/cloud/cloud.cfg.d/80_azure_net_config.cfg"
# Make sure the file exists before appending to it.
touch ${CONFIG_FILEPATH}
cat <<EOF >>${CONFIG_FILEPATH}
datasource:
Azure:
apply_network_config: false
EOF
}
#EOF

Просмотреть файл

@ -1,424 +0,0 @@
#!/bin/bash
# Node provisioning script: early-exit guard and log collection setup.
# Timeout waiting for a file
ERR_FILE_WATCH_TIMEOUT=6
# Trace every command into the script's output.
set -x
# Provisioning is skipped entirely once the completion marker exists.
if [ -f /opt/azure/containers/provision.complete ]; then
echo "Already ran to success exiting..."
exit 0
fi
# NOTE(review): this runs before the helper files are sourced further down - confirm
# aptmarkWALinuxAgent is already defined at this point.
aptmarkWALinuxAgent hold &
# Setup logs for upload to host
LOG_DIR=/var/log/azure/aks
mkdir -p ${LOG_DIR}
# Symlink the provisioning logs and marker files into LOG_DIR.
ln -s /var/log/azure/cluster-provision.log \
/var/log/azure/cluster-provision-cse-output.log \
/opt/azure/*.json \
/opt/azure/cloud-init-files.paved \
/opt/azure/vhd-install.complete \
${LOG_DIR}/
# Redact the necessary secrets from cloud-config.txt so we don't expose any sensitive information
# when cloud-config.txt gets included within log bundles
python3 /opt/azure/containers/provision_redact_cloud_config.py \
--cloud-config-path /var/lib/cloud/instance/cloud-config.txt \
--output-path ${LOG_DIR}/cloud-config.txt
# On Ubuntu 16.04 only: remove chrony and restart systemd-timesyncd.
UBUNTU_RELEASE=$(lsb_release -r -s)
if [[ ${UBUNTU_RELEASE} == "16.04" ]]; then
sudo apt-get -y autoremove chrony
# Print apt-get's exit status into the log; it is not acted upon.
echo $?
sudo systemctl restart systemd-timesyncd
fi
# Record the start of the custom script in /opt/m.
echo $(date),$(hostname), startcustomscript>>/opt/m
# Wait (up to 3600 one-second polls) until the helpers file is non-empty and contains the
# '#HELPERSEOF' marker line.
for i in $(seq 1 3600); do
if [ -s "${CSE_HELPERS_FILEPATH}" ]; then
grep -Fq '#HELPERSEOF' "${CSE_HELPERS_FILEPATH}" && break
fi
if [ $i -eq 3600 ]; then
# Last attempt and still no marker: give up.
exit $ERR_FILE_WATCH_TIMEOUT
else
sleep 1
fi
done
# Drop the marker line, then load the helper, install and config function libraries.
sed -i "/#HELPERSEOF/d" "${CSE_HELPERS_FILEPATH}"
source "${CSE_HELPERS_FILEPATH}"
source "${CSE_DISTRO_HELPERS_FILEPATH}"
source "${CSE_INSTALL_FILEPATH}"
source "${CSE_DISTRO_INSTALL_FILEPATH}"
source "${CSE_CONFIG_FILEPATH}"
# SSH, proxy and CA trust configuration, followed by the outbound connectivity check.
if [[ "${DISABLE_SSH}" == "true" ]]; then
disableSSH || exit $ERR_DISABLE_SSH
fi
# This involves using proxy, log the config before fetching packages
echo "private egress proxy address is '${PRIVATE_EGRESS_PROXY_ADDRESS}'"
# TODO update to use proxy
if [[ "${SHOULD_CONFIGURE_HTTP_PROXY}" == "true" ]]; then
if [[ "${SHOULD_CONFIGURE_HTTP_PROXY_CA}" == "true" ]]; then
configureHTTPProxyCA || exit $ERR_UPDATE_CA_CERTS
fi
configureEtcEnvironment
fi
if [[ "${SHOULD_CONFIGURE_CUSTOM_CA_TRUST}" == "true" ]]; then
configureCustomCaCertificate || exit $ERR_UPDATE_CA_CERTS
fi
# Run the configured outbound command (up to 50 tries); failure aborts provisioning.
if [[ -n "${OUTBOUND_COMMAND}" ]]; then
if [[ -n "${PROXY_VARS}" ]]; then
# PROXY_VARS is evaluated as shell code in this shell before the command runs.
eval $PROXY_VARS
fi
retrycmd_if_failure 50 1 5 $OUTBOUND_COMMAND >> /var/log/azure/cluster-provision-cse-output.log 2>&1 || exit $ERR_OUTBOUND_CONN_FAIL;
fi
# Bring in OS-related vars
source /etc/os-release
# Mandb is not currently available on MarinerV1
if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then
echo "Removing man-db auto-update flag file..."
logs_to_events "AKS.CSE.removeManDbAutoUpdateFlagFile" removeManDbAutoUpdateFlagFile
fi
# should_skip_nvidia_drivers runs in a child bash under retry, so it must be exported;
# its stdout becomes skip_nvidia_driver_install.
export -f should_skip_nvidia_drivers
skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers)
ret=$?
if [[ "$ret" != "0" ]]; then
echo "Failed to determine if nvidia driver install should be skipped"
exit $ERR_NVIDIA_DRIVER_INSTALL
fi
# Clean up GPU drivers on non-GPU nodes and on nodes where the driver install is skipped.
if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then
logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers
fi
logs_to_events "AKS.CSE.disableSystemdResolved" disableSystemdResolved
logs_to_events "AKS.CSE.configureAdminUser" configureAdminUser
# Decide whether a full install is needed: the vhd-install.complete file marks a
# pre-built (golden) image.
VHD_LOGS_FILEPATH=/opt/azure/vhd-install.complete
if [ -f $VHD_LOGS_FILEPATH ]; then
echo "detected golden image pre-install"
logs_to_events "AKS.CSE.cleanUpContainerImages" cleanUpContainerImages
FULL_INSTALL_REQUIRED=false
else
# A VHD-based node without the marker file is an error.
if [[ "${IS_VHD}" = true ]]; then
echo "Using VHD distro but file $VHD_LOGS_FILEPATH not found"
exit $ERR_VHD_FILE_NOT_FOUND
fi
FULL_INSTALL_REQUIRED=true
fi
# Dependencies are installed only on Ubuntu when a full install is required; every
# other combination takes the else branch and prints the "Golden image" message.
if [[ $OS == $UBUNTU_OS_NAME ]] && [ "$FULL_INSTALL_REQUIRED" = "true" ]; then
logs_to_events "AKS.CSE.installDeps" installDeps
else
echo "Golden image; skipping dependencies installation"
fi
logs_to_events "AKS.CSE.installContainerRuntime" installContainerRuntime
if [ "${NEEDS_CONTAINERD}" == "true" ] && [ "${TELEPORT_ENABLED}" == "true" ]; then
logs_to_events "AKS.CSE.installTeleportdPlugin" installTeleportdPlugin
fi
setupCNIDirs
logs_to_events "AKS.CSE.installNetworkPlugin" installNetworkPlugin
# Krustlet (WASM) nodes: install the containerd wasm shims and SpinKube using the versions,
# download locations and URLs listed for each package in components.json.
# The variables are plain assignments on purpose: this code runs at the top level of the
# script, where `local` is an error ("can only be used in a function") and would leave
# them unset.
if [ "${IS_KRUSTLET}" == "true" ]; then
    versionsWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadURIs.default.current.versionsV2[].latestVersion' "$COMPONENTS_FILEPATH")
    downloadLocationWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadLocation' "$COMPONENTS_FILEPATH")
    downloadURLWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadURIs.default.current.downloadURL' "$COMPONENTS_FILEPATH")
    logs_to_events "AKS.CSE.installContainerdWasmShims" installContainerdWasmShims "$downloadLocationWasm" "$downloadURLWasm" "$versionsWasm"

    # The jq string literal "spinkube" must be quoted on both sides, otherwise jq rejects the filter.
    versionsSpinKube=$(jq -r '.Packages[] | select(.name == "spinkube") | .downloadURIs.default.current.versionsV2[].latestVersion' "$COMPONENTS_FILEPATH")
    downloadLocationSpinKube=$(jq -r '.Packages[] | select(.name == "spinkube") | .downloadLocation' "$COMPONENTS_FILEPATH")
    downloadURLSpinKube=$(jq -r '.Packages[] | select(.name == "spinkube") | .downloadURIs.default.current.downloadURL' "$COMPONENTS_FILEPATH")
    # NOTE(review): the argument order (URL, location, versions) differs from the wasm-shims
    # call above (location, URL, versions); kept as-is - confirm against installSpinKube.
    logs_to_events "AKS.CSE.installSpinKube" installSpinKube "$downloadURLSpinKube" "$downloadLocationSpinKube" "$versionsSpinKube"
fi
# By default, never reboot new nodes.
REBOOTREQUIRED=false
echo $(date),$(hostname), "Start configuring GPU drivers"
# GPU setup runs only on GPU nodes for which the driver install was not skipped.
if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then
logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers
if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then
if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then
# Drop-in that restarts the device plugin with the single MIG strategy. The heredoc
# delimiter is quoted, so $MIG_STRATEGY is written literally for systemd to expand.
mkdir -p "/etc/systemd/system/nvidia-device-plugin.service.d"
tee "/etc/systemd/system/nvidia-device-plugin.service.d/10-mig_strategy.conf" > /dev/null <<'EOF'
[Service]
Environment="MIG_STRATEGY=--mig-strategy single"
ExecStart=
ExecStart=/usr/local/nvidia/bin/nvidia-device-plugin $MIG_STRATEGY
EOF
fi
logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
else
logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin"
fi
if [[ "${GPU_NEEDS_FABRIC_MANAGER}" == "true" ]]; then
# fabric manager trains nvlink connections between multi instance gpus.
# it appears this is only necessary for systems with *multiple cards*.
# i.e., an A100 can be partitioned a maximum of 7 ways.
# An NC24ads_A100_v4 has one A100.
# An ND96asr_v4 has eight A100, for a maximum of 56 partitions.
# ND96 seems to require fabric manager *even when not using mig partitions*
# while it fails to install on NC24.
if isMarinerOrAzureLinux "$OS"; then
logs_to_events "AKS.CSE.installNvidiaFabricManager" installNvidiaFabricManager
fi
logs_to_events "AKS.CSE.nvidia-fabricmanager" "systemctlEnableAndStart nvidia-fabricmanager" || exit $ERR_GPU_DRIVERS_START_FAIL
fi
# This will only be true for multi-instance capable VM sizes
# for which the user has specified a partitioning profile.
# it is valid to use mig-capable gpus without a partitioning profile.
if [[ "${MIG_NODE}" == "true" ]]; then
# A100 GPU has a bit in the physical card (infoROM) to enable mig mode.
# Changing this bit in either direction requires a VM reboot on Azure (hypervisor/platform stuff).
# Commands such as `nvidia-smi --gpu-reset` may succeed,
# while commands such as `nvidia-smi -q` will show mismatched current/pending mig mode.
# this will not be required per nvidia for next gen H100.
REBOOTREQUIRED=true
# this service applies the partitioning scheme with nvidia-smi.
# we should consider moving to mig-parted which is simpler/newer.
# we couldn't because of old drivers but that has long been fixed.
logs_to_events "AKS.CSE.ensureMigPartition" ensureMigPartition
fi
fi
echo $(date),$(hostname), "End configuring GPU drivers"
# Install the Kubernetes binaries, then configure Kubernetes, CNI and the container runtime.
logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy" installKubeletKubectlAndKubeProxy
createKubeManifestDir
if [ "${HAS_CUSTOM_SEARCH_DOMAIN}" == "true" ]; then
# CUSTOM_SEARCH_DOMAIN_FILEPATH is executed as a program; its output goes to a dedicated log.
"${CUSTOM_SEARCH_DOMAIN_FILEPATH}" > /opt/azure/containers/setup-custom-search-domain.log 2>&1 || exit $ERR_CUSTOM_SEARCH_DOMAINS_FAIL
fi
# for drop ins, so they don't all have to check/create the dir
mkdir -p "/etc/systemd/system/kubelet.service.d"
logs_to_events "AKS.CSE.configureK8s" configureK8s
logs_to_events "AKS.CSE.configureCNI" configureCNI
# configure and enable dhcpv6 for dual stack feature
if [ "${IPV6_DUAL_STACK_ENABLED}" == "true" ]; then
logs_to_events "AKS.CSE.ensureDHCPv6" ensureDHCPv6
fi
if [ "${NEEDS_CONTAINERD}" == "true" ]; then
# containerd should not be configured until cni has been configured first
logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd
else
logs_to_events "AKS.CSE.ensureDocker" ensureDocker
fi
# MESSAGE_OF_THE_DAY is base64-encoded; the decoded text replaces /etc/motd.
if [[ "${MESSAGE_OF_THE_DAY}" != "" ]]; then
echo "${MESSAGE_OF_THE_DAY}" | base64 -d > /etc/motd
fi
# must run before kubelet starts to avoid race in container status using wrong image
# https://github.com/kubernetes/kubernetes/issues/51017
# can remove when fixed
if [[ "${TARGET_CLOUD}" == "AzureChinaCloud" ]]; then
retagMCRImagesForChina
fi
if [[ "${ENABLE_HOSTS_CONFIG_AGENT}" == "true" ]]; then
logs_to_events "AKS.CSE.configPrivateClusterHosts" configPrivateClusterHosts
fi
if [ "${SHOULD_CONFIG_TRANSPARENT_HUGE_PAGE}" == "true" ]; then
logs_to_events "AKS.CSE.configureTransparentHugePage" configureTransparentHugePage
fi
if [ "${SHOULD_CONFIG_SWAP_FILE}" == "true" ]; then
logs_to_events "AKS.CSE.configureSwapFile" configureSwapFile
fi
# cgroup v2 nodes: kubelet drop-in selecting the systemd cgroup driver.
if [ "${NEEDS_CGROUPV2}" == "true" ]; then
tee "/etc/systemd/system/kubelet.service.d/10-cgroupv2.conf" > /dev/null <<EOF
[Service]
Environment="KUBELET_CGROUP_FLAGS=--cgroup-driver=systemd"
EOF
fi
# Containerd-specific kubelet drop-ins, remaining node tuning, and kubelet start.
if [ "${NEEDS_CONTAINERD}" == "true" ]; then
# gross, but the backticks make it very hard to do in Go
# TODO: move entirely into vhd.
# alternatively, can we verify this is safe with docker?
# or just do it even if not because docker is out of support?
mkdir -p /etc/containerd
# KUBENET_TEMPLATE is base64-encoded; write the decoded template for containerd.
echo "${KUBENET_TEMPLATE}" | base64 -d > /etc/containerd/kubenet_template.conf
# In k8s 1.27, the flag --container-runtime was removed.
# We now have 2 drop-in's, one with the still valid flags that will be applied to all k8s versions,
# the flags are --runtime-request-timeout, --container-runtime-endpoint, --runtime-cgroups
# For k8s >= 1.27, the flag --container-runtime will not be passed.
tee "/etc/systemd/system/kubelet.service.d/10-containerd-base-flag.conf" > /dev/null <<'EOF'
[Service]
Environment="KUBELET_CONTAINERD_FLAGS=--runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock --runtime-cgroups=/system.slice/containerd.service"
EOF
# if k8s version < 1.27.0, add the drop in for --container-runtime flag
if ! semverCompare ${KUBERNETES_VERSION:-"0.0.0"} "1.27.0"; then
tee "/etc/systemd/system/kubelet.service.d/10-container-runtime-flag.conf" > /dev/null <<'EOF'
[Service]
Environment="KUBELET_CONTAINER_RUNTIME_FLAG=--container-runtime=remote"
EOF
fi
fi
# Order kubelet after (and make it require) bind-mount.service.
if [ "${HAS_KUBELET_DISK_TYPE}" == "true" ]; then
tee "/etc/systemd/system/kubelet.service.d/10-bindmount.conf" > /dev/null <<EOF
[Unit]
Requires=bind-mount.service
After=bind-mount.service
EOF
fi
logs_to_events "AKS.CSE.ensureSysctl" ensureSysctl
if [ "${NEEDS_CONTAINERD}" == "true" ] && [ "${SHOULD_CONFIG_CONTAINERD_ULIMITS}" == "true" ]; then
logs_to_events "AKS.CSE.setContainerdUlimits" configureContainerdUlimits
fi
logs_to_events "AKS.CSE.ensureKubelet" ensureKubelet
if [ "${ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE}" == "true" ]; then
logs_to_events "AKS.CSE.ensureNoDupOnPromiscuBridge" ensureNoDupOnPromiscuBridge
fi
# FULL_INSTALL_REQUIRED holds the word "true" or "false", which is run here as a command.
if $FULL_INSTALL_REQUIRED; then
if [[ $OS == $UBUNTU_OS_NAME ]]; then
# mitigation for bug https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1676635
# Unbind the device now, and insert the same command at line 13 of /etc/rc.local.
echo 2dd1ce17-079e-403c-b352-a1921ee207ee > /sys/bus/vmbus/drivers/hv_util/unbind
sed -i "13i\echo 2dd1ce17-079e-403c-b352-a1921ee207ee > /sys/bus/vmbus/drivers/hv_util/unbind\n" /etc/rc.local
fi
fi
# API server reachability validation. The result is stored in VALIDATION_ERR
# (0 = success) instead of aborting immediately.
VALIDATION_ERR=0
# TODO(djsly): Look at leveraging the `aks-check-network.sh` script for this validation instead of duplicating the logic here
# Edge case scenarios:
# high retry times to wait for new API server DNS record to replicate (e.g. stop and start cluster)
# high timeout to address high latency for private dns server to forward request to Azure DNS
# dns check will be done only if we use FQDN for API_SERVER_NAME
API_SERVER_CONN_RETRIES=50
if [[ $API_SERVER_NAME == *.privatelink.* ]]; then
API_SERVER_CONN_RETRIES=100
fi
# API_SERVER_NAME is not a dotted IPv4 address, so resolve it first.
if ! [[ ${API_SERVER_NAME} =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
API_SERVER_DNS_RETRIES=100
if [[ $API_SERVER_NAME == *.privatelink.* ]]; then
API_SERVER_DNS_RETRIES=200
fi
# The nslookup check is skipped (treated as success) when the hosts config agent is enabled.
if [[ "${ENABLE_HOSTS_CONFIG_AGENT}" != "true" ]]; then
RES=$(logs_to_events "AKS.CSE.apiserverNslookup" "retrycmd_if_failure ${API_SERVER_DNS_RETRIES} 1 20 nslookup -timeout=5 -retry=0 ${API_SERVER_NAME}")
STS=$?
else
STS=0
fi
if [[ $STS != 0 ]]; then
# One more timed lookup for the log, then pick the error code depending on whether
# the captured output mentions 168.63.129.16.
time nslookup ${API_SERVER_NAME}
if [[ $RES == *"168.63.129.16"* ]]; then
VALIDATION_ERR=$ERR_K8S_API_SERVER_AZURE_DNS_LOOKUP_FAIL
else
VALIDATION_ERR=$ERR_K8S_API_SERVER_DNS_LOOKUP_FAIL
fi
else
# DNS is fine: check connectivity on port 443 (nc on 18.04, curl elsewhere). If the
# retried check fails, one final timed attempt runs before the error is recorded.
if [ "${UBUNTU_RELEASE}" == "18.04" ]; then
#TODO (djsly): remove this once 18.04 isn't supported anymore
logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
else
logs_to_events "AKS.CSE.apiserverCurl" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 curl -v --cacert /etc/kubernetes/certs/ca.crt https://${API_SERVER_NAME}:443" || time curl -v --cacert /etc/kubernetes/certs/ca.crt "https://${API_SERVER_NAME}:443" || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
fi
fi
else
# API_SERVER_NAME is an IP address: same connectivity check, no DNS step.
if [ "${UBUNTU_RELEASE}" == "18.04" ]; then
#TODO (djsly): remove this once 18.04 isn't supported anymore
logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
else
logs_to_events "AKS.CSE.apiserverCurl" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 curl -v --cacert /etc/kubernetes/certs/ca.crt https://${API_SERVER_NAME}:443" || time curl -v --cacert /etc/kubernetes/certs/ca.crt "https://${API_SERVER_NAME}:443" || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL
fi
fi
# Final steps: restore man-db updates, schedule a reboot or configure unattended
# upgrades, write the completion marker and exit with the validation result.
if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then
echo "Recreating man-db auto-update flag file and kicking off man-db update process at $(date)"
createManDbAutoUpdateFlagFile
# The mandb run is backgrounded and not waited for.
/usr/bin/mandb && echo "man-db finished updates at $(date)" &
fi
# REBOOTREQUIRED holds the word "true" or "false", which is run here as a command.
if $REBOOTREQUIRED; then
echo 'reboot required, rebooting node in 1 minute'
/bin/bash -c "shutdown -r 1 &"
if [[ $OS == $UBUNTU_OS_NAME ]]; then
# logs_to_events should not be run on & commands
aptmarkWALinuxAgent unhold &
fi
else
if [[ $OS == $UBUNTU_OS_NAME ]]; then
# logs_to_events should not be run on & commands
if [ "${ENABLE_UNATTENDED_UPGRADES}" == "true" ]; then
# Despite its name, UU_CONFIG_DIR is the path of a file; the two periodic settings
# are appended to it.
UU_CONFIG_DIR="/etc/apt/apt.conf.d/99periodic"
mkdir -p "$(dirname "${UU_CONFIG_DIR}")"
touch "${UU_CONFIG_DIR}"
chmod 0644 "${UU_CONFIG_DIR}"
echo 'APT::Periodic::Update-Package-Lists "1";' >> "${UU_CONFIG_DIR}"
echo 'APT::Periodic::Unattended-Upgrade "1";' >> "${UU_CONFIG_DIR}"
systemctl unmask apt-daily.service apt-daily-upgrade.service
systemctl enable apt-daily.service apt-daily-upgrade.service
systemctl enable apt-daily.timer apt-daily-upgrade.timer
systemctl restart --no-block apt-daily.timer apt-daily-upgrade.timer
# this is the DOWNLOAD service
# meaning we are wasting IO without even triggering an upgrade
# -________________-
systemctl restart --no-block apt-daily.service
fi
aptmarkWALinuxAgent unhold &
elif isMarinerOrAzureLinux "$OS"; then
if [ "${ENABLE_UNATTENDED_UPGRADES}" == "true" ]; then
if [ "${IS_KATA}" == "true" ]; then
# Currently kata packages must be updated as a unit (including the kernel which requires a reboot). This can
# only be done reliably via image updates as of now so never enable automatic updates.
echo 'EnableUnattendedUpgrade is not supported by kata images, will not be enabled'
else
# By default the dnf-automatic service is notify only in Mariner/AzureLinux.
# Enable the automatic install timer and the check-restart timer.
# Stop the notify only dnf timer since we've enabled the auto install one.
# systemctlDisableAndStop adds .service to the end which doesn't work on timers.
systemctl disable dnf-automatic-notifyonly.timer
systemctl stop dnf-automatic-notifyonly.timer
# At 6:00:00 UTC (1 hour random fuzz) download and install package updates.
systemctl unmask dnf-automatic-install.service || exit $ERR_SYSTEMCTL_START_FAIL
systemctl unmask dnf-automatic-install.timer || exit $ERR_SYSTEMCTL_START_FAIL
systemctlEnableAndStart dnf-automatic-install.timer || exit $ERR_SYSTEMCTL_START_FAIL
# The check-restart service which will inform kured of required restarts should already be running
fi
fi
fi
fi
echo "Custom script finished. API server connection check code:" $VALIDATION_ERR
echo $(date),$(hostname), endcustomscript>>/opt/m
# The marker is written even when validation failed; the early-exit guard at the top of
# this script checks for this file.
mkdir -p /opt/azure/containers && touch /opt/azure/containers/provision.complete
exit $VALIDATION_ERR
#EOF

Просмотреть файл

@ -1,96 +0,0 @@
# Run the provisioning script under a hard timeout, then gather the boot and
# provisioning timestamps that the rest of this script reports.

# Print the ExecMainStartTimestamp property of systemd unit $1.
# Never fails; prints an empty string when the value cannot be read.
unit_start_timestamp() {
  systemctl show "$1" -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true
}

# Print timestamp $1 reformatted as "YYYY-MM-DD HH:MM:SS.mmm".
# An empty input yields empty output: GNU `date -d ""` would otherwise report
# the beginning of today for a unit or event that never actually happened.
format_timestamp() {
  [ -n "$1" ] || return 0
  date -d "$1" +"%F %T.%3N"
}

CSE_STARTTIME=$(date)
CSE_STARTTIME_FORMATTED=$(date +"%F %T.%3N")
# provision.sh gets 15 minutes; timeout follows up with SIGKILL 5s later if it is still running.
timeout -k5s 15m /bin/bash /opt/azure/containers/provision.sh >> /var/log/azure/cluster-provision.log 2>&1
EXIT_CODE=$?
systemctl --no-pager -l status kubelet >> /var/log/azure/cluster-provision-cse-output.log 2>&1
# Only the last 3000 bytes of the provisioning log are reported.
OUTPUT=$(tail -c 3000 "/var/log/azure/cluster-provision.log")

KERNEL_STARTTIME=$(systemctl show -p KernelTimestamp | sed -e "s/KernelTimestamp=//g" || true)
KERNEL_STARTTIME_FORMATTED=$(format_timestamp "${KERNEL_STARTTIME}")
CLOUDINITLOCAL_STARTTIME=$(unit_start_timestamp cloud-init-local)
CLOUDINITLOCAL_STARTTIME_FORMATTED=$(format_timestamp "${CLOUDINITLOCAL_STARTTIME}")
CLOUDINIT_STARTTIME=$(unit_start_timestamp cloud-init)
CLOUDINIT_STARTTIME_FORMATTED=$(format_timestamp "${CLOUDINIT_STARTTIME}")
CLOUDINITFINAL_STARTTIME=$(unit_start_timestamp cloud-final)
CLOUDINITFINAL_STARTTIME_FORMATTED=$(format_timestamp "${CLOUDINITFINAL_STARTTIME}")
NETWORKD_STARTTIME=$(unit_start_timestamp systemd-networkd)
NETWORKD_STARTTIME_FORMATTED=$(format_timestamp "${NETWORKD_STARTTIME}")
GUEST_AGENT_STARTTIME=$(unit_start_timestamp walinuxagent.service)
GUEST_AGENT_STARTTIME_FORMATTED=$(format_timestamp "${GUEST_AGENT_STARTTIME}")
KUBELET_START_TIME=$(unit_start_timestamp kubelet.service)
KUBELET_START_TIME_FORMATTED=$(format_timestamp "${KUBELET_START_TIME}")
# Use only the first NodeReady journal line: several matches would hand date a
# multi-line string it cannot parse.
# NOTE(review): assumes the first match is the moment of interest - confirm.
KUBELET_READY_TIME=$(journalctl -u kubelet | grep -m1 NodeReady | cut -d' ' -f1-3)
KUBELET_READY_TIME_FORMATTED=$(format_timestamp "${KUBELET_READY_TIME}")
SYSTEMD_SUMMARY=$(systemd-analyze || true)
CSE_ENDTIME_FORMATTED=$(date +"%F %T.%3N")
EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/
# Millisecond epoch, used as the event file name.
EVENTS_FILE_NAME=$(date +%s%3N)
# Whole seconds elapsed since CSE_STARTTIME was captured.
EXECUTION_DURATION=$(( $(date +%s) - $(date -d "$CSE_STARTTIME" +%s) ))
# Summarize the provisioning result as a one-line JSON document, print it on
# stdout and persist it to /var/log/azure/aks/provision.json.
# jq writes compact (-c) output so the document is already a single line and
# can be printed quoted. Printing it unquoted would word-split the text, which
# collapses runs of whitespace inside $OUTPUT and glob-expands any bare
# pattern (e.g. a lone `*`) that happens to appear in the log tail.
JSON_STRING=$(jq -n -c \
  --arg ec "$EXIT_CODE" \
  --arg op "$OUTPUT" \
  --arg er "" \
  --arg ed "$EXECUTION_DURATION" \
  --arg ks "$KERNEL_STARTTIME" \
  --arg cinitl "$CLOUDINITLOCAL_STARTTIME" \
  --arg cinit "$CLOUDINIT_STARTTIME" \
  --arg cf "$CLOUDINITFINAL_STARTTIME" \
  --arg ns "$NETWORKD_STARTTIME" \
  --arg cse "$CSE_STARTTIME" \
  --arg ga "$GUEST_AGENT_STARTTIME" \
  --arg ss "$SYSTEMD_SUMMARY" \
  --arg kubelet "$KUBELET_START_TIME" \
  '{ExitCode: $ec, Output: $op, Error: $er, ExecDuration: $ed, KernelStartTime: $ks, CloudInitLocalStartTime: $cinitl, CloudInitStartTime: $cinit, CloudFinalStartTime: $cf, NetworkdStartTime: $ns, CSEStartTime: $cse, GuestAgentStartTime: $ga, SystemdSummary: $ss, BootDatapoints: { KernelStartTime: $ks, CSEStartTime: $cse, GuestAgentStartTime: $ga, KubeletStartTime: $kubelet }}')
mkdir -p /var/log/azure/aks
printf '%s\n' "$JSON_STRING" | tee /var/log/azure/aks/provision.json
# message_string is here because GA only accepts strings in Message.
# `tostring` serializes the object to compact JSON text and -r prints that
# text raw, so the result needs no further clean-up. Stripping backslashes and
# the outer quotes with sed afterwards would corrupt any value that itself
# contains a backslash or a double quote.
message_string=$(jq -n -r \
  --arg EXECUTION_DURATION "${EXECUTION_DURATION}" \
  --arg EXIT_CODE "${EXIT_CODE}" \
  --arg KERNEL_STARTTIME_FORMATTED "${KERNEL_STARTTIME_FORMATTED}" \
  --arg CLOUDINITLOCAL_STARTTIME_FORMATTED "${CLOUDINITLOCAL_STARTTIME_FORMATTED}" \
  --arg CLOUDINIT_STARTTIME_FORMATTED "${CLOUDINIT_STARTTIME_FORMATTED}" \
  --arg CLOUDINITFINAL_STARTTIME_FORMATTED "${CLOUDINITFINAL_STARTTIME_FORMATTED}" \
  --arg NETWORKD_STARTTIME_FORMATTED "${NETWORKD_STARTTIME_FORMATTED}" \
  --arg GUEST_AGENT_STARTTIME_FORMATTED "${GUEST_AGENT_STARTTIME_FORMATTED}" \
  --arg KUBELET_START_TIME_FORMATTED "${KUBELET_START_TIME_FORMATTED}" \
  --arg KUBELET_READY_TIME_FORMATTED "${KUBELET_READY_TIME_FORMATTED}" \
  '{ExitCode: $EXIT_CODE, E2E: $EXECUTION_DURATION, KernelStartTime: $KERNEL_STARTTIME_FORMATTED, CloudInitLocalStartTime: $CLOUDINITLOCAL_STARTTIME_FORMATTED, CloudInitStartTime: $CLOUDINIT_STARTTIME_FORMATTED, CloudFinalStartTime: $CLOUDINITFINAL_STARTTIME_FORMATTED, NetworkdStartTime: $NETWORKD_STARTTIME_FORMATTED, GuestAgentStartTime: $GUEST_AGENT_STARTTIME_FORMATTED, KubeletStartTime: $KUBELET_START_TIME_FORMATTED, KubeletReadyTime: $KUBELET_READY_TIME_FORMATTED } | tostring')
# arg names are defined by GA and all these are required to be correctly read by GA
# EventPid, EventTid are required to be int. No use case for them at this point.
# eventlevel is not assigned anywhere in this script, so unless the caller
# provides it the level is derived from the provisioning exit code.
# NOTE(review): "Informational"/"Error" are assumed to be level names GA
# accepts - confirm against the GA event schema.
if [ "${EXIT_CODE:-0}" -ne 0 ]; then
  default_eventlevel="Error"
else
  default_eventlevel="Informational"
fi
EVENT_JSON=$(jq -n -c \
  --arg Timestamp "${CSE_STARTTIME_FORMATTED}" \
  --arg OperationId "${CSE_ENDTIME_FORMATTED}" \
  --arg Version "1.23" \
  --arg TaskName "AKS.CSE.cse_start" \
  --arg EventLevel "${eventlevel:-${default_eventlevel}}" \
  --arg Message "${message_string}" \
  --arg EventPid "0" \
  --arg EventTid "0" \
  '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}')
# The redirect below fails if the events directory does not exist yet.
mkdir -p "${EVENTS_LOGGING_DIR}"
# Compact jq output is already a single line, so it is written quoted instead
# of relying on word splitting to flatten it.
printf '%s\n' "${EVENT_JSON}" > "${EVENTS_LOGGING_DIR}${EVENTS_FILE_NAME}.json"
# force a log upload to the host after the provisioning script finishes
# if we failed, wait for the upload to complete so that we don't remove
# the VM before it finishes. if we succeeded, upload in the background
# so that the provisioning script returns success more quickly
# Collect the guest agent logs and send the provisioning logs to the host.
# Globals:   none modified
# Arguments: none
# Outputs:   nothing; stdout/stderr of both python invocations are discarded
upload_logs() {
  # find the most recent version of WALinuxAgent and use it to collect logs per
  # https://supportability.visualstudio.com/AzureIaaSVM/_wiki/wikis/AzureIaaSVM/495009/Log-Collection_AGEX?anchor=manually-collect-logs
  # The path is kept in a local: assigning it to PYTHONPATH would change
  # python3's module search path whenever that variable is exported.
  local agent_egg
  agent_egg=$(find /var/lib/waagent -name 'WALinuxAgent*.egg' | sort -rV | head -n1)
  # Without an egg, `python3 -collect-logs -full` would be parsed by python as
  # `-c ollect-logs`, so skip the collection step entirely.
  if [ -n "${agent_egg}" ]; then
    python3 "${agent_egg}" -collect-logs -full >/dev/null 2>&1
  fi
  python3 /opt/azure/containers/provision_send_logs.py >/dev/null 2>&1
}
# Failed provisioning: block until the upload finishes. Successful
# provisioning: upload in the background. Either way the script ends with the
# exit status of provision.sh.
if [[ "${EXIT_CODE}" -ne 0 ]]; then
  upload_logs
else
  upload_logs &
fi
exit "${EXIT_CODE}"