Support glusterfs <-> pool autolinking

- Support glusterfs expand (additional disks)
- Provide `mount_options` for `file_server`, which apply to the local
mount of the disks on the file server
- Allow gluster volume name to be specified
- Provide stronger cross-checking between pool virtual network and
storage cluster virtual network
- Increase update/fault domains (ud/fd) in the availability set to maximums
- Install acl tools for nfsv4 and glusterfs
Fred Park 2017-03-11 15:23:55 -08:00
Parent 0ed28d96fc
Commit cb7b42a231
11 changed files with 230 additions and 42 deletions

View file

@@ -15,7 +15,7 @@ RUN apk update \
musl build-base python3 python3-dev openssl-dev libffi-dev \
ca-certificates boost boost-dev boost-python3 file curl tar pigz \
docker bash \
&& pip3 install --no-cache-dir --upgrade pip azure-storage==0.33.0 \
&& pip3 install --no-cache-dir --upgrade pip azure-storage==0.34.0 \
&& curl -SL https://github.com/arvidn/libtorrent/releases/download/libtorrent-${libtorrent_version//./_}/libtorrent-rasterbar-${libtorrent_version}.tar.gz -o libtorrent-${libtorrent_version}.tar.gz \
&& tar zxvpf libtorrent-${libtorrent_version}.tar.gz \
&& cd libtorrent-rasterbar-${libtorrent_version} \

View file

@@ -115,9 +115,15 @@
"volume_options": [
]
},
"mystoragecluster": {
"my_nfs_cluster": {
"volume_driver": "storage_cluster",
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/mystoragecluster",
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/my_nfs_cluster",
"mount_options": [
]
},
"my_glusterfs_cluster": {
"volume_driver": "storage_cluster",
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/my_glusterfs_cluster",
"mount_options": [
]
}
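
The JSON key of each entry (`my_nfs_cluster`, `my_glusterfs_cluster`) is the storage cluster id that pool autolinking resolves against, and `mount_options` listed here are appended to the driver's default client mount options. A minimal sketch of that id-keyed lookup, assuming a plain dict rather than the project's actual settings API:

```python
# Hypothetical mirror of the sample config above; the real lookup is
# settings.shared_data_volume_mount_options(sdv, sc.id).
shared_data_volumes = {
    'my_glusterfs_cluster': {
        'volume_driver': 'storage_cluster',
        'container_path': '$AZ_BATCH_NODE_SHARED_DIR/my_glusterfs_cluster',
        'mount_options': ['noatime'],
    },
}

def mount_options_for(sc_id):
    # additional client-side mount options for a storage cluster id
    return shared_data_volumes[sc_id].get('mount_options', [])

print(mount_options_for('my_glusterfs_cluster'))  # ['noatime']
```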

View file

@@ -42,8 +42,13 @@
"file_server": {
"type": "nfs",
"mountpoint": "/data",
"mount_options": [
"noatime",
"nodiratime"
],
"server_options": {
"glusterfs": {
"volume_name": "gv0",
"volume_type": "replica",
"transport": "tcp",
"performance.cache-size": "1 GB"

View file

@@ -45,6 +45,7 @@ import uuid
# non-stdlib imports
import azure.batch.models as batchmodels
import azure.mgmt.batch.models as batchmgmtmodels
import azure.mgmt.compute.models as computemodels
# local imports
from . import batch
from . import crypto
@@ -439,12 +440,29 @@ def _create_storage_cluster_mount_args(
sdv, sc.id)):
raise RuntimeError(
'No storage cluster {} found in configuration'.format(sc.id))
# check for same vnet
if vnet.name != sc.virtual_network.name:
# check for same vnet name
if vnet.name.lower() != sc.virtual_network.name.lower():
raise RuntimeError(
'cannot link storage cluster {} on virtual '
'network {} with pool virtual network {}'.format(
sc.id, sc.virtual_network.name, vnet.name))
# cross check vnet resource group
_vnet_tmp = vnet.id.lower().split('/')
if _vnet_tmp[4] != sc.virtual_network.resource_group.lower():
raise RuntimeError(
'cannot link storage cluster {} virtual network in resource group '
'{} with pool virtual network in resource group {}'.format(
sc.id, sc.virtual_network.resource_group,
_vnet_tmp[4]))
# cross check vnet subscription id
_ba_tmp = ba.id.lower().split('/')
if _vnet_tmp[2] != _ba_tmp[2]:
raise RuntimeError(
'cannot link storage cluster {} virtual network in subscription '
'{} with pool virtual network in subscription {}'.format(
sc.id, _vnet_tmp[2], _ba_tmp[2]))
del _vnet_tmp
del _ba_tmp
# get vm count
if sc.vm_count < 1:
raise RuntimeError(
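
The two cross-checks above depend on the fixed segment layout of ARM resource ids: after lowercasing and splitting on `/`, index 2 is the subscription id and index 4 the resource group. A small sketch with a made-up id:

```python
# ARM resource ids have a fixed shape, so '/'-split offsets are stable:
# parts[2] is the subscription id, parts[4] the resource group.
# This id is fabricated for illustration.
vnet_id = ('/subscriptions/01234567-89ab-cdef-0123-456789abcdef'
           '/resourceGroups/My-RG/providers/Microsoft.Network'
           '/virtualNetworks/myvnet')
parts = vnet_id.lower().split('/')
assert parts[0] == ''          # the id starts with '/'
assert parts[2] == '01234567-89ab-cdef-0123-456789abcdef'
assert parts[4] == 'my-rg'
```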
@@ -455,11 +473,11 @@ def _create_storage_cluster_mount_args(
# query first vm for info
vm_name = '{}-vm{}'.format(sc.hostname_prefix, 0)
vm = compute_client.virtual_machines.get(
resource_group_name=rfs.resource_group,
resource_group_name=sc.resource_group,
vm_name=vm_name,
)
nic = resource.get_nic_from_virtual_machine(
network_client, rfs.resource_group, vm)
network_client, sc.resource_group, vm)
# get private ip of vm
remote_ip = nic.ip_configurations[0].private_ip_address
# construct mount options
@@ -490,10 +508,88 @@ def _create_storage_cluster_mount_args(
mo=mo,
)
elif sc.file_server.type == 'glusterfs':
# TODO
# walk all vms and find non-overlapping ud/fds
# use non-overlapping backupvolfile-server in mount option
raise NotImplementedError()
# walk vms and find non-overlapping ud/fds
primary_ip = None
primary_ud = None
primary_fd = None
backup_ip = None
backup_ud = None
backup_fd = None
vms = {}
# first pass, attempt to populate all ip, ud/fd
for i in range(sc.vm_count):
vm_name = '{}-vm{}'.format(sc.hostname_prefix, i)
vm = compute_client.virtual_machines.get(
resource_group_name=sc.resource_group,
vm_name=vm_name,
expand=computemodels.InstanceViewTypes.instance_view,
)
nic = resource.get_nic_from_virtual_machine(
network_client, sc.resource_group, vm)
vms[i] = (vm, nic)
# get private ip and ud/fd of vm
remote_ip = nic.ip_configurations[0].private_ip_address
ud = vm.instance_view.platform_update_domain
fd = vm.instance_view.platform_fault_domain
if primary_ip is None:
primary_ip = remote_ip
primary_ud = ud
primary_fd = fd
if backup_ip is None:
if (primary_ip == remote_ip or primary_ud == ud
or primary_fd == fd):
continue
backup_ip = remote_ip
backup_ud = ud
backup_fd = fd
# second pass, fill in with at least non-overlapping update domains
if backup_ip is None:
for i in range(sc.vm_count):
vm, nic = vms[i]
remote_ip = nic.ip_configurations[0].private_ip_address
ud = vm.instance_view.platform_update_domain
fd = vm.instance_view.platform_fault_domain
if primary_ud != ud:
backup_ip = remote_ip
backup_ud = ud
backup_fd = fd
break
if primary_ip is None or backup_ip is None:
raise RuntimeError(
'Could not find either a primary ip {} or backup ip {} for '
'glusterfs client mount'.format(primary_ip, backup_ip))
logger.debug('primary ip/ud/fd={} backup ip/ud/fd={}'.format(
(primary_ip, primary_ud, primary_fd),
(backup_ip, backup_ud, backup_fd)))
# construct mount options
mo = '_netdev,auto,transport=tcp,backupvolfile-server={}'.format(
backup_ip)
amo = settings.shared_data_volume_mount_options(sdv, sc.id)
if util.is_not_empty(amo):
if any([x.startswith('backupvolfile-server=') for x in amo]):
raise RuntimeError(
('backupvolfile-server cannot be specified as a mount '
'option for storage cluster {}').format(sc.id))
if any([x.startswith('transport=') for x in amo]):
raise RuntimeError(
('transport cannot be specified as a mount option for '
'storage cluster {}').format(sc.id))
mo = ','.join((mo, ','.join(amo)))
# get gluster volume name
try:
volname = sc.file_server.server_options['glusterfs']['volume_name']
except KeyError:
volname = remotefs._GLUSTER_DEFAULT_VOLNAME
# construct mount string for fstab, srcpath is the gluster volume
fstab_mount = (
'{remoteip}:/{srcpath} $AZ_BATCH_NODE_SHARED_DIR/{scid} '
'{fstype} {mo} 0 2').format(
remoteip=primary_ip,
srcpath=volname,
scid=sc.id,
fstype=sc.file_server.type,
mo=mo,
)
else:
raise NotImplementedError(
('cannot handle file_server type {} for storage '

View file

@@ -55,6 +55,7 @@ logger = logging.getLogger(__name__)
util.setup_logger(logger)
# global defines
_SSH_KEY_PREFIX = 'id_rsa_shipyard_remotefs'
_GLUSTER_DEFAULT_VOLNAME = 'gv0'
def _create_managed_disk(compute_client, rfs, disk_name):
@@ -522,6 +523,10 @@ def _create_virtual_machine_extension(
# special processing for gluster (always create these options
# if they don't exist)
if st == 'glusterfs':
try:
server_options.append(so[st]['volume_name'])
except KeyError:
server_options.append(_GLUSTER_DEFAULT_VOLNAME)
try:
server_options.append(so[st]['volume_type'])
except KeyError:
@@ -537,7 +542,8 @@ def _create_virtual_machine_extension(
if st in so:
for key in so[st]:
if (st == 'glusterfs' and
(key == 'volume_type' or key == 'transport')):
(key == 'volume_name' or key == 'volume_type' or
key == 'transport')):
continue
server_options.append('{}:{}'.format(key, so[st][key]))
logger.debug('server options: {}'.format(server_options))
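
Applied to the sample `server_options` from the configuration hunk earlier in this commit, the flattening above yields a positional list (volume name, volume type, transport, then free-form `key:value` pairs) that is comma-joined into the argument the bootstrap script later splits on commas. A sketch under that assumption:

```python
# Flattening of the sample glusterfs server_options; ordering is
# positional: volume name, volume type, transport, then key:value pairs.
glusterfs = {
    'volume_name': 'gv0',
    'volume_type': 'replica',
    'transport': 'tcp',
    'performance.cache-size': '1 GB',
}
server_options = [
    glusterfs['volume_name'],
    glusterfs['volume_type'],
    glusterfs['transport'],
]
server_options.extend(
    '{}:{}'.format(k, v) for k, v in glusterfs.items()
    if k not in ('volume_name', 'volume_type', 'transport'))
print(','.join(server_options))
# gv0,replica,tcp,performance.cache-size:1 GB
```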
@@ -557,7 +563,7 @@ def _create_virtual_machine_extension(
},
protected_settings={
'commandToExecute':
'./{bsf} {f}{i}{m}{n}{o}{p}{r}{s}'.format(
'./{bsf} {f}{i}{m}{n}{o}{p}{r}{s}{t}'.format(
bsf=bootstrap_file,
f=' -f {}'.format(
rfs.storage_cluster.vm_disk_map[offset].filesystem),
@@ -575,6 +581,11 @@ def _create_virtual_machine_extension(
r=' -r {}'.format(
rfs.storage_cluster.vm_disk_map[offset].raid_level),
s=' -s {}'.format(rfs.storage_cluster.file_server.type),
t=' -t {}'.format(
','.join(rfs.storage_cluster.file_server.mount_options)
if util.is_not_empty(
rfs.storage_cluster.file_server.mount_options)
else ''),
),
'storageAccountName': storage.get_storageaccount(),
'storageAccountKey': storage.get_storageaccount_key(),
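
With the new `-t` flag wired in, the rendered `commandToExecute` looks roughly like the string below; every value is hypothetical, and the mapping of `-o` to the server options list is assumed from the bootstrap script's `getopts` string:

```python
# Hypothetical rendering of the bootstrap invocation for a glusterfs
# storage cluster; all flag values are illustrative only.
cmd = ('./shipyard_remotefs_bootstrap.sh -f btrfs -o gv0,replica,tcp '
       '-r 0 -s glusterfs -t noatime,nodiratime')
```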
@@ -602,10 +613,11 @@ def _create_availability_set(compute_client, rfs):
return compute_client.availability_sets.create_or_update(
resource_group_name=rfs.storage_cluster.resource_group,
name=as_name,
# use maximums for ud/fd
parameters=computemodels.AvailabilitySet(
location=rfs.location,
platform_update_domain_count=5,
platform_fault_domain_count=2,
platform_update_domain_count=20,
platform_fault_domain_count=3,
managed=True,
)
)
@@ -1109,7 +1121,7 @@ def delete_storage_cluster(
if len(data_disks) > 0:
data_disk_ops.extend(delete_managed_disks(
compute_client, config, data_disks,
resource_group=rfs.storage_cluster.resource_group, wait=False))
resource_group=rfs.managed_disks.resource_group, wait=False))
# wait for nics to delete
logger.debug('waiting for network interfaces to delete')
for op in nic_ops:
@@ -1328,8 +1340,9 @@ def expand_storage_cluster(
)
lun += 1
logger.info(
'attaching {} additional data disks to virtual machine {}'.format(
len(entry['new_disks']), vm.id))
('attaching {} additional data disks {} to virtual '
'machine {}').format(
len(entry['new_disks']), entry['new_disks'], vm.name))
# update vm
async_ops.append(
(key, premium, compute_client.virtual_machines.create_or_update(
@@ -1368,7 +1381,7 @@ def expand_storage_cluster(
cmd.extend(script_cmd.split())
proc = util.subprocess_nowait_pipe_stdout(cmd)
stdout = proc.communicate()[0]
if stdout is not None:
if util.is_not_empty(stdout):
stdout = stdout.decode('utf8')
if util.on_windows():
stdout = stdout.replace('\n', os.linesep)
@@ -1383,6 +1396,7 @@ def expand_storage_cluster(
vm.name, entry['status'])
if entry['status'] == 0:
logger.info(log)
logger.debug(entry['stdout'])
else:
logger.error(log)
logger.error(entry['stdout'])

View file

@@ -182,7 +182,7 @@ VirtualNetworkSettings = collections.namedtuple(
)
FileServerSettings = collections.namedtuple(
'FileServerSettings', [
'type', 'mountpoint', 'server_options',
'type', 'mountpoint', 'mount_options', 'server_options',
]
)
InboundNetworkSecurityRule = collections.namedtuple(
@@ -2365,11 +2365,13 @@ def fileserver_settings(config, vm_count):
if util.is_none_or_empty(sc_fs_mountpoint):
raise ValueError(
'remote_fs:storage_cluster:file_server must be specified')
sc_mo = _kv_read_checked(conf, 'mount_options')
# get server options
so_conf = _kv_read_checked(conf, 'server_options', {})
return FileServerSettings(
type=sc_fs_type,
mountpoint=sc_fs_mountpoint,
mount_options=sc_mo,
server_options=so_conf,
)
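
Since the new `mount_options` field is read with `_kv_read_checked` without a default, it is simply `None` when absent; downstream callers guard with `util.is_not_empty`. A minimal sketch of the resulting tuple for the sample file server configuration:

```python
import collections

# Mirror of the namedtuple above, populated from the sample
# file_server configuration earlier in this commit.
FileServerSettings = collections.namedtuple(
    'FileServerSettings',
    ['type', 'mountpoint', 'mount_options', 'server_options'])

fs = FileServerSettings(
    type='nfs',
    mountpoint='/data',
    mount_options=['noatime', 'nodiratime'],  # None if unspecified
    server_options={},
)
```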

View file

@@ -39,6 +39,16 @@ The pool schema is as follows:
}
]
},
"virtual_network": {
"name": "myvnet",
"resource_group": "vnet-in-another-rg",
"create_nonexistant": false,
"address_space": "10.0.0.0/16",
"subnet": {
"name": "subnet-for-batch-vms",
"address_prefix": "10.0.0.0/20"
}
},
"ssh": {
"username": "docker",
"expiry_days": 7,
@@ -129,6 +139,24 @@ data defined in `files` prior to pool creation and disable the option above
`data ingress` command.
* (optional) `blobxfer_extra_options` are any extra options to pass to
`blobxfer`.
* (optional) `virtual_network` is the property for specifying an ARM-based
virtual network resource for the pool. This is only available for
UserSubscription Batch accounts.
* (required) `name` is the name of the virtual network
* (optional) `resource_group` is the resource group containing the virtual network. If
the resource group name is not specified here, the `resource_group`
specified in the `batch` credentials will be used instead.
* (optional) `create_nonexistant` specifies if the virtual network and
subnet should be created if not found. If not specified, this defaults
to `false`.
* (required if creating, optional otherwise) `address_space` is the
allowed address space for the virtual network.
* (required) `subnet` specifies the subnet properties.
* (required) `name` is the subnet name.
* (required) `address_prefix` is the subnet address prefix to use for
allocating Batch compute nodes to. The maximum number of compute nodes
a subnet can support is 4096, which maps roughly to a CIDR mask of
20 bits (see the arithmetic sketch after this list).
* (optional) `ssh` is the property for creating a user to accommodate SSH
sessions to compute nodes. If this property is absent, then an SSH user is not
created with pool creation.
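
The 4096 figure is straight prefix arithmetic; Azure also reserves a handful of addresses in every subnet, which is why the mapping is only rough:

```python
# Host capacity of a /20 subnet: 2^(32 - prefix) addresses.
# Azure reserves a few addresses per subnet, so the usable node
# count is slightly below this -- hence "roughly" 4096.
prefix = 20
print(2 ** (32 - prefix))  # 4096
```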

View file

@@ -9,7 +9,7 @@ azure-storage==0.34.0
blobxfer==0.12.1
click==6.7
future==0.16.0
msrest==0.4.5
msrest==0.4.6
msrestazure==0.4.7
pathlib2==2.2.1; python_version < '3.5'
scandir==1.5; python_version < '3.5'

View file

@@ -412,9 +412,9 @@ EOF
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
apt-get install -y -q --no-install-recommends nfs-common
apt-get install -y -q --no-install-recommends nfs-common nfs4-acl-tools
elif [ $server_type == "glusterfs" ]; then
apt-get install -y -q --no-install-recommends glusterfs-client
apt-get install -y -q --no-install-recommends glusterfs-client acl
else
echo "Unknown file server type: $sc_arg"
exit 1
@@ -425,10 +425,11 @@ EOF
# install azure storage python dependency
apt-get install -y -q --no-install-recommends \
build-essential libssl-dev libffi-dev libpython3-dev python3-dev python3-pip
pip3 install --no-cache-dir azure-storage==0.33.0
pip3 install --no-cache-dir azure-storage==0.34.0
# install cascade dependencies
if [ $p2penabled -eq 1 ]; then
apt-get install -y -q --no-install-recommends python3-libtorrent pigz
apt-get install -y -q --no-install-recommends \
python3-libtorrent pigz
fi
fi
elif [[ $offer == centos* ]] || [[ $offer == "rhel" ]] || [[ $offer == "oracle-linux" ]]; then
@@ -512,14 +513,14 @@ EOF
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
yum install -y nfs-utils
yum install -y nfs-utils nfs4-acl-tools
systemctl daemon-reload
$rpcbindenable
systemctl start rpcbind
elif [ $server_type == "glusterfs" ]; then
yum install -y epel-release centos-release-gluster38
sed -i -e "s/enabled=1/enabled=0/g" /etc/yum.repos.d/CentOS-Gluster-3.8.repo
yum install -y --enablerepo=centos-gluster38,epel glusterfs-client
yum install -y --enablerepo=centos-gluster38,epel glusterfs-client acl
else
echo "Unknown file server type: $sc_arg"
exit 1
@@ -601,14 +602,14 @@ elif [[ $offer == opensuse* ]] || [[ $offer == sles* ]]; then
IFS=':' read -ra sc <<< "$sc_arg"
server_type=${sc[0]}
if [ $server_type == "nfs" ]; then
zypper -n in nfs-client
zypper -n in nfs-client nfs4-acl-tools
systemctl daemon-reload
systemctl enable rpcbind
systemctl start rpcbind
elif [ $server_type == "glusterfs" ]; then
zypper addrepo http://download.opensuse.org/repositories/filesystems/$repodir/filesystems.repo
zypper -n --gpg-auto-import-keys ref
zypper -n in glusterfs
zypper -n in glusterfs acl
else
echo "Unknown file server type: $sc_arg"
exit 1

View file

@@ -8,7 +8,6 @@ DEBIAN_FRONTEND=noninteractive
# constants
gluster_brick_mountpath=/gluster/brick
gluster_brick_location=$gluster_brick_mountpath/brick0
gluster_volname=gv0
ipaddress=$(ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1)
# vars
@@ -22,6 +21,7 @@ optimize_tcp=0
server_options=
premium_storage=0
raid_level=-1
mount_options=
# functions
setup_nfs() {
@@ -130,6 +130,10 @@ gluster_poll_for_volume() {
}
setup_glusterfs() {
# parse server options in the format: volname,voltype,transport,key:value,...
IFS=',' read -ra so <<< "$server_options"
local gluster_volname=${so[0]}
# create hosts array
IFS=',' read -ra hosts <<< "$peer_ips"
# master (first host) performs peering
if [ ${hosts[0]} == $ipaddress ]; then
@@ -147,9 +151,7 @@ setup_glusterfs() {
# wait for connections
local numnodes=${#hosts[@]}
gluster_poll_for_connections $numnodes
# parse server options in the format: voltype,transport,key:value,...
IFS=',' read -ra so <<< "$server_options"
local voltype=${so[0],,}
local voltype=${so[1],,}
local volarg=
if [ $voltype == "replica" ] || [ $voltype == "stripe" ]; then
volarg="$voltype $numnodes"
@@ -157,7 +159,7 @@ setup_glusterfs() {
# allow custom replica and/or stripe counts
volarg=$voltype
fi
local transport=${so[1],,}
local transport=${so[2],,}
if [ -z $transport ]; then
transport="tcp"
fi
@@ -173,7 +175,7 @@ setup_glusterfs() {
echo "Creating $voltype gluster volume $gluster_volname ($force$bricks)"
gluster volume create $gluster_volname $volarg transport $transport$bricks $force
# modify volume properties as per input
for e in ${so[@]:2}; do
for e in ${so[@]:3}; do
IFS=':' read -ra kv <<< "$e"
echo "Setting volume option ${kv[@]}"
gluster volume set $gluster_volname ${kv[0]} ${kv[1]}
@@ -181,6 +183,23 @@ setup_glusterfs() {
# start volume
echo "Starting gluster volume $gluster_volname"
gluster volume start $gluster_volname
# heal volume if force created with certain volume types
if [ ! -z $force ]; then
if [[ $voltype == replica* ]] || [[ $voltype == disperse* ]]; then
echo "Checking if gluster volume $gluster_volname needs healing"
set +e
gluster volume heal $gluster_volname info
if [ $? -eq 0 ]; then
set -e
gluster volume heal $gluster_volname
# print status after heal
gluster volume heal $gluster_volname info healed
gluster volume heal $gluster_volname info heal-failed
gluster volume heal $gluster_volname info split-brain
fi
set -e
fi
fi
fi
# poll for volume created
@@ -238,7 +257,7 @@ setup_glusterfs() {
}
# begin processing
while getopts "h?abf:i:m:no:pr:s:" opt; do
while getopts "h?abf:i:m:no:pr:s:t:" opt; do
case "$opt" in
h|\?)
echo "shipyard_remotefs_bootstrap.sh parameters"
@@ -253,6 +272,7 @@ while getopts "h?abf:i:m:no:pr:s:" opt; do
echo "-p premium storage disks"
echo "-r [RAID level] RAID level"
echo "-s [server type] server type"
echo "-t [mount options] mount options"
echo ""
exit 1
;;
@@ -286,6 +306,9 @@ while getopts "h?abf:i:m:no:pr:s:" opt; do
s)
server_type=${OPTARG,,}
;;
t)
mount_options=$OPTARG
;;
esac
done
shift $((OPTIND-1))
@@ -336,7 +359,7 @@ EOF
# install required server_type software
apt-get update
if [ $server_type == "nfs" ]; then
apt-get install -y --no-install-recommends nfs-kernel-server
apt-get install -y --no-install-recommends nfs-kernel-server nfs4-acl-tools
# patch buggy nfs-mountd.service unit file
# https://bugs.launchpad.net/ubuntu/+source/nfs-utils/+bug/1590799
set +e
@@ -420,6 +443,11 @@ fi
# check if disks are already in raid set
raid_resized=0
if [ $raid_level -ge 0 ]; then
# redirect mountpath if gluster for bricks
saved_mp=$mountpath
if [ $server_type == "glusterfs" ]; then
mountpath=$gluster_brick_mountpath
fi
format_target=0
md_preexist=0
if [ $filesystem == "btrfs" ]; then
@@ -534,6 +562,9 @@ if [ $raid_level -ge 0 ]; then
if [ -z $target_uuid ]; then
read target_uuid < <(blkid ${all_raid_disks[0]} | awk -F "[= ]" '{print $3}' | sed 's/\"//g')
fi
# restore mountpath
mountpath=$saved_mp
unset saved_mp
fi
# create filesystem on target device
@@ -587,18 +618,24 @@ if [ $attach_disks -eq 0 ]; then
# add fstab entry
if [ $add_fstab -eq 1 ]; then
echo "Adding $target_uuid to mountpoint $mountpath to /etc/fstab"
# construct mount options
if [ -z $mount_options ]; then
mount_options="defaults"
else
mount_options="defaults,$mount_options"
fi
if [ $premium_storage -eq 1 ]; then
# disable barriers due to RO cache
if [ $filesystem == "btrfs" ]; then
mo=",nobarrier"
mount_options+=",nobarrier"
else
mo=",barrier=0"
mount_options+=",barrier=0"
fi
else
# enable discard to save cost on standard storage
mo=",discard"
mount_options+=",discard"
fi
echo "UUID=$target_uuid $mountpath $filesystem defaults,noatime,nodiratime${mo} 0 2" >> /etc/fstab
echo "UUID=$target_uuid $mountpath $filesystem ${mount_options} 0 2" >> /etc/fstab
fi
# create mountpath
mkdir -p $mountpath

View file

@@ -75,7 +75,6 @@ elif [ $server_type == "glusterfs" ]; then
echo "glusterfs service status:"
systemctl status glusterfs-server
echo ""
echo "volume info:"
gluster volume info all
echo ""
gluster volume status all detail