batch-shipyard/scripts/shipyard_nodeprep.sh

959 строки
33 KiB
Bash
Executable File

#!/usr/bin/env bash
# abort on the first failing command, including failures inside a pipeline
set -eo pipefail

# globals: option state filled in by the command line parsing below

# feature toggles (0 = off, 1 = on)
azurefile=0
cascadecontainer=0
gluster_on_compute=0
hpnssh=0
networkopt=0
p2penabled=0

# blobxfer docker image tag
blobxferversion=latest

# valued options, empty until supplied on the command line
block=
encrypted=
gpu=
offer=
p2p=
prefix=
privatereg=
sc_args=
sku=
version=
# process command line options
# Each flag stores its argument (lower-cased for -e, -m, -o, -s and -t) in a
# script global; -h or an unrecognized flag prints the usage text and exits 1.
while getopts "h?abde:fg:m:no:p:r:s:t:v:wx:" opt; do
  case "$opt" in
    h|\?)
      printf '%s\n' \
        "shipyard_nodeprep.sh parameters" \
        "" \
        "-a install azurefile docker volume driver" \
        "-b block until resources loaded" \
        "-d use docker container for cascade" \
        "-e [thumbprint] encrypted credentials with cert" \
        "-f set up glusterfs on compute" \
        "-g [nv-series:driver file:nvidia docker pkg] gpu support" \
        "-m [type:scid] mount storage cluster" \
        "-n optimize network TCP settings" \
        "-o [offer] VM offer" \
        "-p [prefix] storage container prefix" \
        "-r [container:archive:image id] private registry" \
        "-s [sku] VM sku" \
        "-t [enabled:non-p2p concurrent download:seed bias:compression:pub pull passthrough] p2p sharing" \
        "-v [version] batch-shipyard version" \
        "-w install openssh-hpn" \
        "-x [blobxfer version] blobxfer version" \
        ""
      exit 1
      ;;
    a)
      azurefile=1
      ;;
    b)
      # the list of images to block on comes from the environment
      block=$SHIPYARD_DOCKER_IMAGES_PRELOAD
      ;;
    d)
      cascadecontainer=1
      ;;
    e)
      encrypted=${OPTARG,,}
      ;;
    f)
      gluster_on_compute=1
      ;;
    g)
      # kept case-sensitive: the first field is compared against "True" later
      gpu=$OPTARG
      ;;
    m)
      # comma-separated list of type:scid entries
      IFS=',' read -ra sc_args <<< "${OPTARG,,}"
      ;;
    n)
      networkopt=1
      ;;
    o)
      offer=${OPTARG,,}
      ;;
    p)
      # stored as a ready-to-use command line fragment
      prefix="--prefix $OPTARG"
      ;;
    r)
      privatereg=$OPTARG
      ;;
    s)
      sku=${OPTARG,,}
      ;;
    t)
      p2p=${OPTARG,,}
      IFS=':' read -ra p2pflags <<< "$p2p"
      # [[ ]] keeps an empty or whitespace-bearing first field from turning
      # into a malformed test expression
      if [[ "${p2pflags[0]}" == "true" ]]; then
        p2penabled=1
      else
        p2penabled=0
      fi
      ;;
    v)
      version=$OPTARG
      ;;
    w)
      hpnssh=1
      ;;
    x)
      blobxferversion=$OPTARG
      ;;
  esac
done
# drop the parsed options, and a bare "--" separator if one follows them
shift $((OPTIND-1))
[ "$1" = "--" ] && shift
# check args: offer, sku and version are mandatory
[[ -n $offer ]] || {
  echo "ERROR: vm offer not specified"
  exit 1
}
[[ -n $sku ]] || {
  echo "ERROR: vm sku not specified"
  exit 1
}
[[ -n $version ]] || {
  echo "ERROR: batch-shipyard version not specified"
  exit 1
}
# Test whether one string occurs inside another.
# Arguments: $1 - string to search
#            $2 - substring to look for, matched literally
# Returns:   0 if $2 occurs in $1, 1 otherwise (also 1 when $2 is empty)
contains() {
  local haystack=$1
  local needle=$2
  # Removing the shortest prefix that ends in the needle changes the string
  # only when the needle is present. Quoting the needle stops glob
  # characters in it (*, ?, [) from being interpreted as a pattern.
  [[ ${haystack#*"$needle"} != "$haystack" ]]
}
# Abort the script if /dev/sdb1 shows up in the mount table as fuseblk.
# Any matching mount line is echoed to stdout. Exits with status 1 on a
# match; returns 0 otherwise. errexit is enabled on return.
check_for_buggy_ntfs_mount() {
  # errexit is suspended because "no match" is the expected outcome
  set +e
  mount | grep /dev/sdb1 | grep fuseblk
  rc=$?
  set -e
  # sdb1 is not mounted as fuseblk: nothing to report
  [ $rc -eq 0 ] || return 0
  echo "ERROR: /dev/sdb1 temp disk is mounted as fuseblk/ntfs"
  exit 1
}
# Verify that lspci lists at least one nvidia device.
# The full lspci output is always echoed to stdout. Exits the script with
# status 1 when no line mentions nvidia (case-insensitive). errexit is
# enabled on return.
check_for_nvidia_card() {
  # errexit is suspended so a failed match can be reported after the listing
  set +e
  out=$(lspci)
  grep -i nvidia <<< "$out" > /dev/null
  rc=$?
  set -e
  echo "$out"
  if [ $rc -eq 0 ]; then
    return 0
  fi
  echo "ERROR: No Nvidia card(s) detected!"
  exit 1
}
# Install the nvidia driver and nvidia-docker, then pre-create the
# nvidia-docker driver volume.
# Globals:   gpu (read) - colon-separated "nv-series:driver file:nvidia
#            docker pkg", as supplied through -g
# Arguments: $1 - VM offer; only "ubuntuserver" and centos* are supported
# Exits the script with status 1 when no nvidia card is detected, when the
# offer is unsupported, or when the docker volume cannot be created within
# five minutes. errexit is enabled on return.
install_nvidia_software() {
  local offer=$1
  shift
  # check for nvidia card
  check_for_nvidia_card
  # split the gpu option into its three fields
  local -a gpuargs
  IFS=':' read -ra gpuargs <<< "$gpu"
  local is_nv_series=${gpuargs[0]}
  local nvdriver=${gpuargs[1]}
  local nvdocker=${gpuargs[2]}
  # remove nouveau; a failure here (e.g. the module is not loaded) must not
  # abort node prep under errexit
  rmmod nouveau || echo "WARNING: could not unload nouveau kernel module"
  # purge nouveau off system
  if [[ $offer == "ubuntuserver" ]]; then
    # -y: the script runs unattended, so apt-get must not prompt
    apt-get --purge remove -y xserver-xorg-video-nouveau xserver-xorg-video-nouveau-hwe-16.04
  elif [[ $offer == centos* ]]; then
    yum erase -y xorg-x11-drv-nouveau
  else
    echo "ERROR: unsupported distribution for nvidia/GPU, offer: $offer"
    exit 1
  fi
  # blacklist nouveau from being loaded if rebooted
  printf '%s\n' \
    "blacklist nouveau" \
    "blacklist lbm-nouveau" \
    "options nouveau modeset=0" \
    "alias nouveau off" \
    "alias lbm-nouveau off" \
    > /etc/modprobe.d/blacklist-nouveau.conf
  # get development essentials for nvidia driver
  if [[ $offer == "ubuntuserver" ]]; then
    install_packages "$offer" build-essential
  elif [[ $offer == centos* ]]; then
    install_packages "$offer" gcc binutils make "kernel-devel-$(uname -r)"
  fi
  # get additional dependency if NV-series VMs
  if [[ $is_nv_series == "True" ]]; then
    if [[ $offer == "ubuntuserver" ]]; then
      install_packages "$offer" xserver-xorg-dev
    elif [[ $offer == centos* ]]; then
      install_packages "$offer" xorg-x11-server-devel
    fi
  fi
  # install driver (-s is passed to the driver installer)
  "./$nvdriver" -s
  # add flag to config template for GRID driver
  if [[ $is_nv_series == "True" ]]; then
    echo "IgnoreSP=TRUE" >> /etc/nvidia/gridd.conf.template
  fi
  # install nvidia-docker
  if [[ $offer == "ubuntuserver" ]]; then
    dpkg -i "$nvdocker"
  elif [[ $offer == centos* ]]; then
    rpm -Uvh "$nvdocker"
  fi
  # enable and start nvidia docker service
  systemctl enable nvidia-docker.service
  systemctl start nvidia-docker.service
  systemctl status nvidia-docker.service
  # get driver version: ninth space-separated field of the "Kernel Module" line
  local nvdriverver
  nvdriverver=$(grep "Kernel Module" /proc/driver/nvidia/version | cut -d ' ' -f 9)
  echo nvidia driver version $nvdriverver detected
  # create the docker volume now to avoid volume driver conflicts for
  # tasks. run this in a loop as it can fail if triggered too quickly
  # after start
  local nv_start nv_now nv_diff
  nv_start=$(date -u +"%s")
  set +e
  while :; do
    echo "INFO: Attempting to create nvidia-docker volume with version $nvdriverver"
    if docker volume create -d nvidia-docker --name "nvidia_driver_$nvdriverver"; then
      docker volume list
      break
    fi
    nv_now=$(date -u +"%s")
    nv_diff=$(( (nv_now - nv_start) / 60 ))
    # fail after 5 minutes of attempts
    if [ "$nv_diff" -ge 5 ]; then
      echo "ERROR: could not create nvidia-docker volume"
      exit 1
    fi
    sleep 1
  done
  set -e
}
# Install and start the azurefile docker volume driver from files in the
# current directory, then create and list the docker volumes.
# Arguments: $1 - VM offer
#            $2 - VM sku
install_azurefile_docker_volume_driver() {
  # fix ownership/permissions and move the binary and its env file in place
  chown root:root azurefile-dockervolumedriver*
  chmod 755 azurefile-dockervolumedriver
  chmod 640 azurefile-dockervolumedriver.env
  mv azurefile-dockervolumedriver /usr/bin
  mv azurefile-dockervolumedriver.env /etc/default/azurefile-dockervolumedriver
  if [[ $1 != "ubuntuserver" || $2 != 14.04.* ]]; then
    # systemd: the unit directory differs on the SUSE offers
    case "$1" in
      opensuse*|sles*)
        systemdloc=/usr/lib/systemd/system
        ;;
      *)
        systemdloc=/lib/systemd/system
        ;;
    esac
    mv azurefile-dockervolumedriver.service $systemdloc
    systemctl daemon-reload
    systemctl enable azurefile-dockervolumedriver
    systemctl start azurefile-dockervolumedriver
  else
    # ubuntu 14.04.x is driven through initctl instead
    mv azurefile-dockervolumedriver.conf /etc/init
    initctl reload-configuration
    initctl start azurefile-dockervolumedriver
  fi
  # create docker volumes
  chmod +x azurefile-dockervolume-create.sh
  ./azurefile-dockervolume-create.sh
  # list volumes
  docker volume list
}
# Refresh the package manager index for the given offer, retrying on
# failure.
# Arguments: $1 - VM offer; selects apt-get, yum or zypper. An offer that
#            matches none of them counts as success without running anything.
# Makes up to 30 attempts with a one second pause in between; exits the
# script with status 1 once they are exhausted. errexit is enabled on return.
refresh_package_index() {
  offer=$1
  set +e
  retries=30
  while [ $retries -gt 0 ]; do
    case "$offer" in
      ubuntuserver|debian)
        apt-get update
        ;;
      centos*|rhel|oracle-linux)
        yum makecache -y fast
        ;;
      opensuse*|sles*)
        zypper -n --gpg-auto-import-keys ref
        ;;
    esac
    # $? is the status of the command run above (0 when no arm matched)
    [ $? -ne 0 ] || break
    retries=$((retries - 1))
    if [ $retries -eq 0 ]; then
      echo "ERROR: Could not update package index"
      exit 1
    fi
    sleep 1
  done
  set -e
}
# Install packages with the package manager that matches the offer,
# retrying on failure.
# Arguments: $1  - VM offer; selects apt-get, yum or zypper. An offer that
#                  matches none of them counts as success without running
#                  anything.
#            $2+ - package names and extra package manager options, passed
#                  through verbatim
# Makes up to 30 attempts with a one second pause in between; exits the
# script with status 1 once they are exhausted. errexit is enabled on return.
install_packages() {
  local offer=$1
  shift
  local retries=30
  set +e
  while [ "$retries" -gt 0 ]; do
    # "$@" keeps every argument intact: no re-splitting on whitespace and
    # no glob expansion against files in the working directory
    case "$offer" in
      ubuntuserver|debian)
        apt-get install -y -q -o Dpkg::Options::="--force-confnew" --no-install-recommends "$@"
        ;;
      centos*|rhel|oracle-linux)
        yum install -y "$@"
        ;;
      opensuse*|sles*)
        zypper -n in "$@"
        ;;
    esac
    # $? is the status of the command run above (0 when no arm matched)
    if [ $? -eq 0 ]; then
      break
    fi
    retries=$((retries - 1))
    if [ "$retries" -eq 0 ]; then
      echo "ERROR: Could not install packages: $*"
      exit 1
    fi
    sleep 1
  done
  set -e
}
# Pull a docker image, retrying on transient registry errors.
# Arguments: $1 - image reference to pull
# Outputs:   the pull output on success; a WARNING or ERROR message followed
#            by the pull output on failure
# A failed pull is retried only when its output contains "toomanyrequests"
# or "connection reset by peer", up to 60 attempts with a random 1-5 second
# pause in between. Any other failure, or running out of attempts, exits
# the script with docker's exit code. errexit is enabled on return.
docker_pull_image() {
  local image=$1
  local pull_out
  local rc
  local retries=60
  set +e
  while [ "$retries" -gt 0 ]; do
    pull_out=$(docker pull "$image" 2>&1)
    rc=$?
    if [ "$rc" -eq 0 ]; then
      echo "$pull_out"
      break
    fi
    # non-zero exit code: check if pull output has toomanyrequests or
    # connection resets. contains is invoked directly as a command;
    # wrapping it in [ ] would run test(1) on its name instead.
    if contains "$pull_out" "toomanyrequests" || contains "$pull_out" "connection reset by peer"; then
      printf 'WARNING: will retry:\n%s\n' "$pull_out"
    else
      printf 'ERROR:\n%s\n' "$pull_out"
      exit "$rc"
    fi
    retries=$((retries - 1))
    if [ "$retries" -le 0 ]; then
      echo "ERROR: Could not pull docker image: $image"
      exit "$rc"
    fi
    # random backoff so nodes do not hit the registry in lockstep
    sleep "$(( (RANDOM % 5) + 1 ))s"
  done
  set -e
}
# check sdb1 mount
check_for_buggy_ntfs_mount

# set python env vars
# NOTE(review): plain assignments without `export`; child processes see
# these only if the names are already exported in the environment -- confirm
# whether an export was intended.
LC_ALL=en_US.UTF-8
PYTHONASYNCIODEBUG=1

# store node prep start, preferring python3 when it is installed
if command -v python3 > /dev/null 2>&1; then
  npstart=$(python3 -c 'import datetime;print(datetime.datetime.utcnow().timestamp())')
else
  npstart=$(python -c 'import datetime;import time;print(time.mktime(datetime.datetime.utcnow().timetuple()))')
fi

# set node prep status files
nodeprepfinished=$AZ_BATCH_NODE_SHARED_DIR/.node_prep_finished
cascadefailed=$AZ_BATCH_NODE_SHARED_DIR/.cascade_failed

# get ip address of eth0
ipaddress=$(ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1)

# decrypt encrypted creds
if [[ -n $encrypted ]]; then
  # convert pfx to pem
  pfxfile=$AZ_BATCH_CERTIFICATES_DIR/sha1-$encrypted.pfx
  privatekey=$AZ_BATCH_CERTIFICATES_DIR/key.pem
  openssl pkcs12 -in $pfxfile -out $privatekey -nodes -password file:$pfxfile.pw
  # remove pfx-related files
  rm -f $pfxfile $pfxfile.pw
  # decrypt creds: each value is base64-decoded, then RSA-decrypted
  SHIPYARD_STORAGE_ENV=$(echo $SHIPYARD_STORAGE_ENV | base64 -d | openssl rsautl -decrypt -inkey $privatekey)
  # the login password is only handled when a login username is set
  if [[ -n ${DOCKER_LOGIN_USERNAME+x} ]]; then
    DOCKER_LOGIN_PASSWORD=$(echo $DOCKER_LOGIN_PASSWORD | base64 -d | openssl rsautl -decrypt -inkey $privatekey)
  fi
  if [[ -n $privatereg ]]; then
    SHIPYARD_PRIVATE_REGISTRY_STORAGE_ENV=$(echo $SHIPYARD_PRIVATE_REGISTRY_STORAGE_ENV | base64 -d | openssl rsautl -decrypt -inkey $privatekey)
  fi
fi

# set iptables rules
if [[ $p2penabled -eq 1 ]]; then
  # disable DHT connection tracking
  iptables -t raw -I PREROUTING -p udp --dport 6881 -j CT --notrack
  iptables -t raw -I OUTPUT -p udp --sport 6881 -j CT --notrack
fi

# check if we're coming up from a reboot; the cascade failure marker takes
# precedence over the node prep finished marker
if [[ -f $cascadefailed ]]; then
  echo "ERROR: $cascadefailed file exists, assuming cascade failure during node prep"
  exit 1
fi
if [[ -f $nodeprepfinished ]]; then
  echo "INFO: $nodeprepfinished file exists, assuming successful completion of node prep"
  exit 0
fi
# one-time setup, skipped once the node prep finished marker exists
if [[ ! -f $nodeprepfinished ]]; then
  # set up hpn-ssh
  if [[ $hpnssh -eq 1 ]]; then
    ./shipyard_hpnssh.sh $offer $sku
  fi
  # optimize network TCP settings
  if [[ $networkopt -eq 1 ]]; then
    sysctlfile=/etc/sysctl.d/60-azure-batch-shipyard.conf
    # write the settings only if the file is missing or empty
    if [[ ! -s $sysctlfile ]]; then
      cat > $sysctlfile << EOF
net.core.rmem_default=16777216
net.core.wmem_default=16777216
net.core.rmem_max=16777216
net.core.wmem_max=16777216
net.core.netdev_max_backlog=30000
net.ipv4.tcp_max_syn_backlog=80960
net.ipv4.tcp_mem=16777216 16777216 16777216
net.ipv4.tcp_rmem=4096 87380 16777216
net.ipv4.tcp_wmem=4096 65536 16777216
net.ipv4.tcp_slow_start_after_idle=0
net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_abort_on_overflow=1
net.ipv4.route.flush=1
EOF
    fi
  fi
fi
# install docker host engine
# One branch per distribution family, selected by $offer:
#   ubuntuserver/debian, centos*/rhel/oracle-linux, opensuse*/sles*.
# Each branch installs a pinned docker version, rewrites the docker daemon
# options, starts docker and then sets up the optional azurefile driver,
# gpu software, glusterfs and storage cluster mount dependencies.
# Any other offer is rejected at the end.
if [ $offer == "ubuntuserver" ] || [ $offer == "debian" ]; then
# NOTE(review): assigned without `export`; apt-get sees this only if the
# name is already exported in the environment -- confirm intent.
DEBIAN_FRONTEND=noninteractive
# name will be appended to dockerversion
dockerversion=17.06.0~ce-0~
name=
# per-sku settings: the srv*/gfs* variables hold service control commands
# that are executed further down by unquoted expansion
if [[ $sku == 14.04.* ]]; then
name=ubuntu-trusty
srvstart="initctl start docker"
srvstop="initctl stop docker"
srvstatus="initctl status docker"
gfsstart="initctl start glusterfs-server"
gpgkey=https://download.docker.com/linux/ubuntu/gpg
repo=https://download.docker.com/linux/ubuntu
dockerversion=${dockerversion}ubuntu
elif [[ $sku == 16.04* ]]; then
name=ubuntu-xenial
srvstart="systemctl start docker.service"
srvstop="systemctl stop docker.service"
srvenable="systemctl enable docker.service"
srvstatus="systemctl status docker.service"
gfsstart="systemctl start glusterfs-server"
gfsenable="systemctl enable glusterfs-server"
gpgkey=https://download.docker.com/linux/ubuntu/gpg
repo=https://download.docker.com/linux/ubuntu
dockerversion=${dockerversion}ubuntu
elif [[ $sku == "8" ]]; then
name=debian-jessie
srvstart="systemctl start docker.service"
srvstop="systemctl stop docker.service"
srvenable="systemctl enable docker.service"
srvstatus="systemctl status docker.service"
gfsstart="systemctl start glusterfs-server"
gfsenable="systemctl enable glusterfs-server"
gpgkey=https://download.docker.com/linux/debian/gpg
repo=https://download.docker.com/linux/debian
dockerversion=${dockerversion}debian
else
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
# within this branch gpu is accepted on ubuntu-xenial only
if [ ! -z $gpu ] && [ $name != "ubuntu-xenial" ]; then
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
# reload network settings
if [ $networkopt -eq 1 ]; then
if [ $name == "ubuntu-trusty" ]; then
service procps start
else
service procps reload
fi
fi
# refresh package index
refresh_package_index $offer
# install required software first
install_packages $offer apt-transport-https ca-certificates curl software-properties-common
if [ $name == "ubuntu-trusty" ]; then
install_packages $offer linux-image-extra-$(uname -r) linux-image-extra-virtual
fi
# add gpgkey for repo, retrying the download up to 100 times with a one
# second pause; errexit is suspended so a failed attempt does not abort
set +e
retries=100
while [ $retries -gt 0 ]; do
curl -fsSL $gpgkey | apt-key add -
if [ $? -eq 0 ]; then
break
fi
let retries=retries-1
if [ $retries -eq 0 ]; then
echo "ERROR: Could not add key for docker repo"
exit 1
fi
sleep 1
done
set -e
# add repo
add-apt-repository "deb [arch=amd64] $repo $(lsb_release -cs) stable"
# refresh index
refresh_package_index $offer
# ensure docker opts service modifications are idempotent: an existing
# DOCKER_OPTS= line in /etc/default/docker skips install and configuration
set +e
grep '^DOCKER_OPTS=' /etc/default/docker
if [ $? -ne 0 ]; then
# install docker engine
install_packages $offer docker-ce=$dockerversion
set -e
$srvstop
set +e
rm -f /var/lib/docker/network/files/local-kv.db
# The two sed edits below differ in how "no matching line" is handled.
# DOCKER_OPTS: the script quits with status 1 ($q1) if the last line is
# reached without a match, which triggers the `|| echo ... >>` append.
# DOCKER_TMPDIR: NOTE(review) a plain s///g exits 0 even when nothing
# matched, so its append looks like it only runs when sed itself fails
# (for example a missing file) -- confirm this is intended.
if [ $name == "debian-jessie" ]; then
mkdir -p /mnt/resource/docker-tmp
sed -i -e 's,.*export DOCKER_TMPDIR=.*,export DOCKER_TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export DOCKER_TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
else
mkdir -p /mnt/docker-tmp
sed -i -e 's,.*export DOCKER_TMPDIR=.*,export DOCKER_TMPDIR="/mnt/docker-tmp",g' /etc/default/docker || echo export DOCKER_TMPDIR=\"/mnt/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/docker\" >> /etc/default/docker
fi
# systemd skus: make the unit read /etc/default/docker and append
# $DOCKER_OPTS to its ExecStart line
if [[ $name == "ubuntu-xenial" ]] || [[ $name == "debian-jessie" ]]; then
sed -i '/^\[Service\]/a EnvironmentFile=/etc/default/docker' /lib/systemd/system/docker.service
sed -i '/^ExecStart=/ s/$/ $DOCKER_OPTS/' /lib/systemd/system/docker.service
set -e
systemctl daemon-reload
$srvenable
set +e
fi
set -e
$srvstart
# setup and start azure file docker volume driver
if [ $azurefile -eq 1 ]; then
install_azurefile_docker_volume_driver $offer $sku
fi
set +e
fi
set -e
# ensure docker daemon is running
$srvstatus
# install gpu related items
if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
install_nvidia_software $offer
fi
# set up glusterfs
if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
install_packages $offer glusterfs-server
# gfsenable is only set for the systemd skus
if [[ ! -z $gfsenable ]]; then
$gfsenable
fi
$gfsstart
# create brick directory
mkdir -p /mnt/gluster
fi
# install dependencies for storage cluster mount
# each sc_args entry is "type:scid"; only the type matters here
if [ ! -z $sc_args ]; then
for sc_arg in ${sc_args[@]}; do
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
install_packages $offer nfs-common nfs4-acl-tools
elif [ $server_type == "glusterfs" ]; then
install_packages $offer glusterfs-client acl
else
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
fi
# install dependencies if not using cascade container
if [ $cascadecontainer -eq 0 ]; then
# install azure storage python dependency
install_packages $offer build-essential libssl-dev libffi-dev libpython3-dev python3-dev python3-pip
pip3 install --no-cache-dir azure-storage==0.35.1
# install cascade dependencies
if [ $p2penabled -eq 1 ]; then
install_packages $offer python3-libtorrent pigz
fi
fi
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
# ensure container only support
if [ $cascadecontainer -eq 0 ]; then
echo "ERROR: only supported through shipyard container"
exit 1
fi
# within this branch gpu is accepted on centos* offers only
if [[ ! -z $gpu ]] && [[ $offer != centos* ]]; then
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
if [[ $sku == 7.* ]]; then
dockerversion=17.06.0.ce-1.el7.centos
if [[ $offer == "oracle-linux" ]]; then
srvenable="systemctl enable docker.service"
srvstart="systemctl start docker.service"
srvstatus="systemctl status docker.service"
gfsenable="systemctl enable glusterd"
rpcbindenable="systemctl enable rpcbind"
# TODO, in order to support docker > 1.9, need to upgrade to UEKR4
# oracle-linux is rejected unconditionally, so the assignments above are
# never used
echo "ERROR: oracle linux is not supported at this time"
exit 1
else
srvenable="chkconfig docker on"
srvstart="systemctl start docker.service"
srvstatus="systemctl status docker.service"
gfsenable="chkconfig glusterd on"
rpcbindenable="chkconfig rpcbind on"
fi
else
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
# reload network settings
if [ $networkopt -eq 1 ]; then
sysctl -p
fi
# add docker repo to yum
install_packages $offer yum-utils device-mapper-persistent-data lvm2
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
refresh_package_index $offer
install_packages $offer docker-ce-$dockerversion
# modify docker opts
# errexit is active here: a failing sed is covered by its `|| echo`
# fallback, so only a failure of the fallback itself aborts the script
mkdir -p /mnt/resource/docker-tmp
sed -i -e 's,.*export DOCKER_TMPDIR=.*,export DOCKER_TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export DOCKER_TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/default/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/default/docker
sed -i '/^\[Service\]/a EnvironmentFile=/etc/default/docker' /lib/systemd/system/docker.service
sed -i '/^ExecStart=/ s/$/ $DOCKER_OPTS/' /lib/systemd/system/docker.service
systemctl daemon-reload
# start docker service and enable docker daemon on boot
$srvenable
$srvstart
$srvstatus
# setup and start azure file docker volume driver
if [ $azurefile -eq 1 ]; then
install_azurefile_docker_volume_driver $offer $sku
fi
# install gpu related items
if [ ! -z $gpu ] && [ ! -f $nodeprepfinished ]; then
install_nvidia_software $offer
fi
# set up glusterfs
if [ $gluster_on_compute -eq 1 ] && [ ! -f $nodeprepfinished ]; then
install_packages $offer epel-release centos-release-gluster38
# the gluster repo is disabled by default and enabled per install command
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server
systemctl daemon-reload
$gfsenable
systemctl start glusterd
# create brick directory
mkdir -p /mnt/resource/gluster
fi
# install dependencies for storage cluster mount
if [ ! -z $sc_args ]; then
for sc_arg in ${sc_args[@]}; do
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
install_packages $offer nfs-utils nfs4-acl-tools
systemctl daemon-reload
$rpcbindenable
systemctl start rpcbind
elif [ $server_type == "glusterfs" ]; then
install_packages $offer epel-release centos-release-gluster38
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
install_packages $offer --enablerepo=centos-gluster38,epel glusterfs-server acl
else
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
fi
elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
# ensure container only support
if [ $cascadecontainer -eq 0 ]; then
echo "ERROR: only supported through shipyard container"
exit 1
fi
# gpu is not supported on these offers
if [ ! -z $gpu ]; then
echo "ERROR: gpu unsupported on this sku: $sku for offer $offer"
exit 1
fi
# reload network settings
if [ $networkopt -eq 1 ]; then
sysctl -p
fi
if [ ! -f $nodeprepfinished ]; then
# add Virtualization:containers repo for recent docker builds
# repodir stays empty for an unrecognized sku, which is rejected below
repodir=
if [[ $offer == opensuse* ]]; then
dockerversion=1.12.6-30.2
if [[ $sku == "42.1" ]]; then
repodir=openSUSE_Leap_42.1
elif [[ $sku == "42.2" ]]; then
repodir=openSUSE_Leap_42.2
fi
# add container repo for zypper
zypper addrepo http://download.opensuse.org/repositories/Virtualization:containers/$repodir/Virtualization:containers.repo
elif [[ $offer == sles* ]]; then
dockerversion=1.12.6-90.1
if [[ $sku == "12-sp1" ]]; then
repodir=SLE_12_SP1
elif [[ $sku == "12-sp2" ]]; then
repodir=SLE_12_SP2
fi
# enable container module
SUSEConnect -p sle-module-containers/12/x86_64 -r ''
fi
if [ -z $repodir ]; then
echo "ERROR: unsupported sku: $sku for offer: $offer"
exit 1
fi
# update index
refresh_package_index $offer
# install docker engine
install_packages $offer docker-$dockerversion
# modify docker opts, docker opts in /etc/sysconfig/docker
# NOTE(review): the DOCKER_TMPDIR edit below targets /etc/default/docker
# while the DOCKER_OPTS edit targets /etc/sysconfig/docker -- confirm the
# first path is intended.
mkdir -p /mnt/resource/docker-tmp
sed -i -e 's,.*export DOCKER_TMPDIR=.*,export DOCKER_TMPDIR="/mnt/resource/docker-tmp",g' /etc/default/docker || echo export DOCKER_TMPDIR=\"/mnt/resource/docker-tmp\" >> /etc/default/docker
sed -i -e '/^DOCKER_OPTS=.*/,${s||DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\"|;b};$q1' /etc/sysconfig/docker || echo DOCKER_OPTS=\"-H tcp://127.0.0.1:2375 -H unix:///var/run/docker.sock -g /mnt/resource/docker\" >> /etc/sysconfig/docker
systemctl daemon-reload
# start docker service and enable docker daemon on boot
systemctl enable docker
systemctl start docker
systemctl status docker
# setup and start azure file docker volume driver
if [ $azurefile -eq 1 ]; then
install_azurefile_docker_volume_driver $offer $sku
fi
# set up glusterfs
if [ $gluster_on_compute -eq 1 ]; then
zypper addrepo http://download.opensuse.org/repositories/filesystems/$repodir/filesystems.repo
zypper -n --gpg-auto-import-keys ref
install_packages $offer glusterfs
systemctl daemon-reload
systemctl enable glusterd
systemctl start glusterd
# create brick directory
mkdir -p /mnt/resource/gluster
fi
# install dependencies for storage cluster mount
if [ ! -z $sc_args ]; then
for sc_arg in ${sc_args[@]}; do
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
install_packages $offer nfs-client nfs4-acl-tools
systemctl daemon-reload
systemctl enable rpcbind
systemctl start rpcbind
elif [ $server_type == "glusterfs" ]; then
zypper addrepo http://download.opensuse.org/repositories/filesystems/$repodir/filesystems.repo
zypper -n --gpg-auto-import-keys ref
install_packages $offer glusterfs acl
else
echo "ERROR: Unknown file server type ${sc[0]} for ${sc[1]}"
exit 1
fi
done
fi
# if hpc sku, set up intel mpi
if [[ $offer == sles-hpc* ]]; then
if [ $sku != "12-sp1" ]; then
echo "ERROR: unsupported sku for intel mpi setup on SLES"
exit 1
fi
install_packages $offer lsb
rpm -Uvh --nodeps /opt/intelMPI/intel_mpi_packages/*.rpm
mkdir -p /opt/intel/compilers_and_libraries/linux
ln -s /opt/intel/impi/5.0.3.048 /opt/intel/compilers_and_libraries/linux/mpi
fi
fi
else
echo "ERROR: unsupported offer: $offer (sku: $sku)"
exit 1
fi
# retrieve docker images related to data movement
for dm_tag in "blobxfer:$blobxferversion" "batch-shipyard:tfm-$version" "batch-shipyard:rjm-$version"; do
  docker_pull_image alfpark/$dm_tag
done
# login to registry server, only when a login username is set
if [[ -n ${DOCKER_LOGIN_USERNAME+x} ]]; then
  docker login -u $DOCKER_LOGIN_USERNAME -p $DOCKER_LOGIN_PASSWORD $DOCKER_LOGIN_SERVER
fi
# mount any storage clusters
if [[ -n $sc_args ]]; then
  # eval and split fstab var to expand vars (this is ok since it is set by shipyard)
  # entries are separated by '#' and matched to sc_args by position
  fstab_mounts=$(eval echo "$SHIPYARD_STORAGE_CLUSTER_FSTAB")
  IFS='#' read -ra fstabs <<< "$fstab_mounts"
  i=0
  for sc_arg in ${sc_args[@]}; do
    IFS=':' read -ra sc <<< "$sc_arg"
    # the mount point is named after the second field of the entry
    mountpoint=$AZ_BATCH_NODE_SHARED_DIR/${sc[1]}
    echo "INFO: Creating host directory for storage cluster $sc_arg at $mountpoint"
    mkdir -p $mountpoint
    chmod 777 $mountpoint
    echo "INFO: Adding $mountpoint to fstab"
    fstab_entry="${fstabs[$i]}"
    echo $fstab_entry >> /etc/fstab
    tail -n1 /etc/fstab
    echo "INFO: Mounting $mountpoint"
    START=$(date -u +"%s")
    # retry the mount once per second; errexit is suspended meanwhile
    set +e
    until mount $mountpoint; do
      NOW=$(date -u +"%s")
      DIFF=$(( (NOW - START) / 60 ))
      # fail after 5 minutes of attempts
      if [ $DIFF -ge 5 ]; then
        echo "ERROR: Could not mount storage cluster $sc_arg on: $mountpoint"
        exit 1
      fi
      sleep 1
    done
    set -e
    echo "INFO: $mountpoint mounted."
    i=$((i + 1))
  done
fi
# touch node prep finished file to preserve idempotency
touch $nodeprepfinished
# touch cascade failed file, this will be removed once cascade is successful
touch $cascadefailed
# execute cascade; errexit is suspended so the exit code can be inspected
set +e
cascadepid=
envfile=
if [[ $cascadecontainer -eq 1 ]]; then
  # run the container detached in p2p mode, otherwise remove it on exit
  detached="--rm"
  if [[ $p2penabled -eq 1 ]]; then
    detached="-d"
  fi
  # store docker cascade start
  if command -v python3 > /dev/null 2>&1; then
    drpstart=$(python3 -c 'import datetime;print(datetime.datetime.utcnow().timestamp())')
  else
    drpstart=$(python -c 'import datetime;import time;print(time.mktime(datetime.datetime.utcnow().timetuple()))')
  fi
  # create env file: script state plus every SHIPYARD_, AZ_BATCH_ and
  # DOCKER_LOGIN_ variable from the environment
  envfile=.docker_cascade_envfile
  cat > $envfile << EOF
prefix=$prefix
ipaddress=$ipaddress
offer=$offer
sku=$sku
npstart=$npstart
drpstart=$drpstart
privatereg=$privatereg
p2p=$p2p
$(env | grep SHIPYARD_)
$(env | grep AZ_BATCH_)
$(env | grep DOCKER_LOGIN_)
EOF
  chmod 600 $envfile
  # pull image
  docker_pull_image alfpark/batch-shipyard:cascade-$version
  # launch container in the background
  docker run $detached --net=host --env-file $envfile \
    -v /var/run/docker.sock:/var/run/docker.sock \
    -v $AZ_BATCH_NODE_ROOT_DIR:$AZ_BATCH_NODE_ROOT_DIR \
    -w $AZ_BATCH_TASK_WORKING_DIR \
    -p 6881-6891:6881-6891 -p 6881-6891:6881-6891/udp \
    alfpark/batch-shipyard:cascade-$version &
  cascadepid=$!
else
  # timing events are recorded only when SHIPYARD_TIMING is set
  # backfill node prep start
  if [[ -n ${SHIPYARD_TIMING+x} ]]; then
    ./perf.py nodeprep start $prefix --ts $npstart --message "offer=$offer,sku=$sku"
  fi
  # install private registry if required
  if [[ -n $privatereg ]]; then
    # mark private registry start
    if [[ -n ${SHIPYARD_TIMING+x} ]]; then
      ./perf.py privateregistry start $prefix --message "ipaddress=$ipaddress"
    fi
    ./setup_private_registry.py $privatereg $ipaddress $prefix
    # mark private registry end
    if [[ -n ${SHIPYARD_TIMING+x} ]]; then
      ./perf.py privateregistry end $prefix
    fi
  fi
  if [[ -n ${SHIPYARD_TIMING+x} ]]; then
    # mark node prep finished
    ./perf.py nodeprep end $prefix
    # start cascade
    ./perf.py cascade start $prefix
  fi
  # run cascade in the background
  ./cascade.py $p2p --ipaddress $ipaddress $prefix &
  cascadepid=$!
fi
# if not in p2p mode, then wait for cascade exit
if [[ $p2penabled -eq 0 ]]; then
  wait $cascadepid
  rc=$?
  if [ $rc -ne 0 ]; then
    echo "ERROR: cascade exited with non-zero exit code: $rc"
    # drop the finished marker before exiting with cascade's exit code
    rm -f $nodeprepfinished
    exit $rc
  fi
fi
set -e
# remove cascade failed file
rm -f $cascadefailed
# block until images ready if specified
# $block is a comma-separated list of images; docker is polled every two
# seconds until `docker images -q` reports every one of them
if [[ -n $block ]]; then
  echo "INFO: blocking until images ready: $block"
  IFS=',' read -ra RES <<< "$block"
  declare -a missing
  while :; do
    # collect the images that are not present yet
    for image in "${RES[@]}"; do
      if [ -z "$(docker images -q $image 2>/dev/null)" ]; then
        missing=("${missing[@]}" "$image")
      fi
    done
    if [ ${#missing[@]} -eq 0 ]; then
      echo "INFO: all docker images present"
      break
    fi
    # start the next poll with an empty list
    unset missing
    sleep 2
  done
  # NOTE(review): the env file is removed only on this blocking path --
  # confirm it is meant to stay on disk when -b is not given.
  if [[ $cascadecontainer -eq 1 ]]; then
    rm -f $envfile
  fi
fi