Pool <-> storage cluster linkage checkpoint

Fred Park 2017-03-08 23:43:16 -08:00
Parent 91403de98f
Commit 5fcddad7ea
9 changed files with 224 additions and 64 deletions

View file

@@ -118,8 +118,9 @@
},
"mystoragecluster": {
"volume_driver": "storage_cluster",
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/nfs",
"container_path": "$AZ_BATCH_NODE_SHARED_DIR/mystoragecluster",
"mount_options": [
"noatime"
]
}
}
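The renamed container_path follows the mount point convention introduced later in this commit: the shared data volume is mounted under a host directory named after the storage cluster id rather than a fixed "nfs" directory. A minimal sketch of that convention, reusing the example id from the configuration above:

```python
# Path convention sketch: both the host mountpoint created by the node prep
# script and the container_path above derive from the storage cluster id.
sc_id = 'mystoragecluster'
host_mountpoint = '$AZ_BATCH_NODE_SHARED_DIR/{}'.format(sc_id)
container_path = host_mountpoint  # '$AZ_BATCH_NODE_SHARED_DIR/mystoragecluster'
```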

View file

@@ -28,7 +28,7 @@
"x509_cert_sha1_thumbprint": "",
"user": "",
"password": "",
"endpoint": "https://management.azure.com/",
"endpoint": "https://management.core.windows.net/",
"token_cache": {
"enabled": true,
"filename": ""
@@ -50,7 +50,7 @@
"x509_cert_sha1_thumbprint": "",
"user": "",
"password": "",
"endpoint": "https://batch.azure.com/",
"endpoint": "https://batch.core.windows.net/",
"token_cache": {
"enabled": true,
"filename": ""

View file

@@ -207,7 +207,11 @@ def create_aad_credentials(ctx, aad_settings):
if util.is_not_empty(aad_password):
raise ValueError('cannot specify both cert auth and password')
if settings.verbose(ctx.config):
logger.debug('authenticating with certificate')
logger.debug(
('authenticating with certificate, endpoint={} directoryid={} '
'appid={} cert_thumbprint={}').format(
endpoint, aad_directory_id, aad_application_id,
aad_cert_thumbprint))
context = adal.AuthenticationContext(
'{}/{}'.format(_LOGIN_AUTH_URI, aad_directory_id))
return msrestazure.azure_active_directory.AdalAuthentication(
@@ -223,7 +227,10 @@ def create_aad_credentials(ctx, aad_settings):
raise ValueError(
'Cannot specify both an AAD Service Principal and User')
if settings.verbose(ctx.config):
logger.debug('authenticating with auth key')
logger.debug(
('authenticating with auth key, endpoint={} directoryid={} '
'appid={}').format(
endpoint, aad_directory_id, aad_application_id))
return azure.common.credentials.ServicePrincipalCredentials(
aad_application_id,
aad_auth_key,
@@ -232,7 +239,10 @@ def create_aad_credentials(ctx, aad_settings):
)
elif util.is_not_empty(aad_password):
if settings.verbose(ctx.config):
logger.debug('authenticating with username and password')
logger.debug(
('authenticating with username and password, endpoint={} '
'directoryid={} username={}').format(
endpoint, aad_directory_id, aad_user))
try:
return azure.common.credentials.UserPassCredentials(
username=aad_user,
@@ -246,7 +256,9 @@ def create_aad_credentials(ctx, aad_settings):
'Do not pass an AAD password and try again.'))
else:
if settings.verbose(ctx.config):
logger.debug('authenticating with device code')
logger.debug(
('authenticating with device code, endpoint={} '
'directoryid={}').format(endpoint, aad_directory_id))
return DeviceCodeAuthentication(
context=adal.AuthenticationContext(
'{}/{}'.format(_LOGIN_AUTH_URI, aad_directory_id)),

View file

@@ -43,13 +43,14 @@ except ImportError:
import uuid
# non-stdlib imports
import azure.batch.models as batchmodels
import azure.mgmt.network.models as networkmodels
# local imports
from . import batch
from . import crypto
from . import data
from . import keyvault
from . import network
from . import remotefs
from . import resource
from . import settings
from . import storage
from . import util
@@ -442,7 +443,8 @@ def _add_pool(
# retrieve settings
pool_settings = settings.pool_settings(config)
bc = settings.credentials_batch(config)
vnet_subnet_id = None
vnet = None
subnet = None
# check for virtual network settings
if (pool_settings.virtual_network is not None and
util.is_not_empty(pool_settings.virtual_network.name)):
@@ -451,7 +453,7 @@
'Invalid subnet name on virtual network {}'.format(
pool_settings.virtual_network.name))
# create virtual network and subnet if specified
vnet, subnet = network.create_virtual_network_and_subnet(
vnet, subnet = resource.create_virtual_network_and_subnet(
network_client, bc.resource_group, bc.location,
pool_settings.virtual_network)
# ensure address prefix for subnet is valid
@@ -482,14 +484,88 @@
pool_settings.vm_count)):
raise RuntimeError('Pool deployment rejected by user')
logger.info('using virtual network subnet id: {}'.format(subnet.id))
vnet_subnet_id = subnet.id
else:
logger.debug('no virtual network settings specified')
# TODO construct mounts for storage_cluster_mount
import sys
sys.exit(1)
# construct fstab mount for storage_cluster_mount
fstab_mount = None
if storage_cluster_mount:
# ensure usersubscription account
if not bc.user_subscription:
raise RuntimeError(
'{} account is not a UserSubscription account'.format(
bc.account))
# check for vnet/subnet presence
if vnet is None or subnet is None:
raise RuntimeError(
'cannot mount a storage cluster without a valid virtual '
'network or subnet')
# get remotefs settings
rfs = settings.remotefs_settings(config)
sc = rfs.storage_cluster
# iterate through shared data volumes and find storage clusters
sdv = settings.global_resources_shared_data_volumes(config)
if (sc.id not in sdv or
not settings.is_shared_data_volume_storage_cluster(
sdv, sc.id)):
raise RuntimeError(
'No storage cluster {} found in configuration'.format(sc.id))
# check for same vnet
if vnet.name != sc.virtual_network.name:
raise RuntimeError(
'cannot link storage cluster {} on virtual '
'network {} with pool virtual network {}'.format(
sc.id, sc.virtual_network.name, vnet.name))
# get vm count
if sc.vm_count < 1:
raise RuntimeError(
'storage cluster {} vm_count {} is invalid'.format(
sc.id, sc.vm_count))
# get fileserver type
if sc.file_server.type == 'nfs':
# query first vm for info
vm_name = '{}-vm{}'.format(sc.hostname_prefix, 0)
vm = compute_client.virtual_machines.get(
resource_group_name=rfs.resource_group,
vm_name=vm_name,
)
_, pip = resource.get_nic_and_pip_from_virtual_machine(
network_client, rfs.resource_group, vm)
# get static ip setting
if (pip.public_ip_allocation_method ==
networkmodels.IPAllocationMethod.static):
# use ip for mount command
remote_ip = pip.ip_address
else:
# use fqdn dns name for mount command
remote_ip = pip.dns_settings.fqdn
# construct mount options
mo = '_netdev,auto'
amo = settings.shared_data_volume_mount_options(sdv, sc.id)
if util.is_not_empty(amo):
mo = ','.join((mo, ','.join(amo)))
# construct mount string for fstab
fstab_mount = (
'{remoteip}:{srcpath} $AZ_BATCH_NODE_SHARED_DIR/{scid} '
'{fstype} {mo} 0 2').format(
remoteip=remote_ip,
srcpath=sc.file_server.mountpoint,
scid=sc.id,
fstype=sc.file_server.type,
mo=mo,
)
else:
raise NotImplementedError(
('cannot handle file_server type {} for storage '
'cluster {}').format(sc.file_server.type, sc.id))
if util.is_none_or_empty(fstab_mount):
raise RuntimeError(
('Could not construct an fstab mount entry for storage '
'cluster {}').format(sc.id))
# log config
if settings.verbose(config):
logger.debug('storage cluster {} fstab mount: {}'.format(
sc.id, fstab_mount))
del storage_cluster_mount
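To make the new storage_cluster_mount path concrete, here is a minimal sketch (not part of the commit) of the fstab entry the block above constructs, assuming a hypothetical NFS storage cluster with id mystoragecluster, a first file server VM with static public IP 10.0.0.4, an export path of /data, and the noatime mount option from the shared data volume configuration:

```python
# Hedged sketch of the fstab_mount string built in _add_pool; the concrete
# values (ip, export path, cluster id) are assumptions for illustration.
mo = ','.join(('_netdev,auto', 'noatime'))
fstab_mount = (
    '{remoteip}:{srcpath} $AZ_BATCH_NODE_SHARED_DIR/{scid} '
    '{fstype} {mo} 0 2').format(
        remoteip='10.0.0.4',
        srcpath='/data',
        scid='mystoragecluster',
        fstype='nfs',
        mo=mo,
    )
# -> '10.0.0.4:/data $AZ_BATCH_NODE_SHARED_DIR/mystoragecluster nfs
#     _netdev,auto,noatime 0 2'
```

This string is handed to each compute node through the SHIPYARD_STORAGE_CLUSTER_FSTAB start task environment variable added further down and appended to /etc/fstab by the node prep script.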
# add encryption cert to account if specified
encrypt = settings.batch_shipyard_encryption_enabled(config)
if encrypt:
@@ -567,7 +643,7 @@ def _add_pool(
del _rflist
# create start task commandline
start_task = [
'{npf} {a}{b}{d}{e}{f}{g}{n}{o}{p}{r}{s}{t}{v}{w}{x}'.format(
'{npf} {a}{b}{d}{e}{f}{g}{m}{n}{o}{p}{r}{s}{t}{v}{w}{x}'.format(
npf=_NODEPREP_FILE[0],
a=' -a' if azurefile_vd else '',
b=' -b {}'.format(block_for_gr) if block_for_gr else '',
@@ -575,6 +651,8 @@ def _add_pool(
e=' -e {}'.format(pfx.sha1) if encrypt else '',
f=' -f' if gluster_on_compute else '',
g=' -g {}'.format(gpu_env) if gpu_env is not None else '',
m=' -m {}'.format(sc.id) if util.is_not_empty(
fstab_mount) else '',
n=' -n' if settings.can_tune_tcp(pool_settings.vm_size) else '',
o=' -o {}'.format(pool_settings.offer),
p=' -p {}'.format(
@@ -659,6 +737,17 @@ def _add_pool(
)
)
del psa
if subnet is not None:
pool.network_configuration = batchmodels.NetworkConfiguration(
subnet_id=subnet.id,
)
if util.is_not_empty(fstab_mount):
pool.start_task.environment_settings.append(
batchmodels.EnvironmentSetting(
'SHIPYARD_STORAGE_CLUSTER_FSTAB',
fstab_mount
)
)
# add optional environment variables
if bs.store_timing_metrics:
pool.start_task.environment_settings.append(

View file

@@ -44,7 +44,7 @@ import azure.mgmt.resource.resources.models as rgmodels
import msrestazure.azure_exceptions
# local imports
from . import crypto
from . import network
from . import resource
from . import settings
from . import storage
from . import util
@@ -632,7 +632,7 @@ def create_storage_cluster(
# upload scripts to blob storage for customscript
blob_urls = storage.upload_for_remotefs(blob_client, remotefs_files)
# create virtual network and subnet if specified
vnet, subnet = network.create_virtual_network_and_subnet(
vnet, subnet = resource.create_virtual_network_and_subnet(
network_client, rfs.resource_group, rfs.location,
rfs.storage_cluster.virtual_network)
@@ -714,42 +714,6 @@ def create_storage_cluster(
vm.hardware_profile.vm_size))
def _get_nic_and_pip_from_virtual_machine(network_client, rfs, vm):
# type: (azure.mgmt.network.NetworkManagementClient,
# settings.RemoteFsSettings, computemodels.VirtualMachine) ->
# Tuple[networkmodels.NetworkInterface,
# networkmodels.PublicIPAddress]
"""Get network interface and public ip from a virtual machine
:param azure.mgmt.network.NetworkManagementClient network_client:
network client
:param rfs settings.RemoteFsSettings: remote fs settings
:param vm computemodels.VirtualMachine: vm
:rtype: tuple
:return: (nic, pip)
"""
# get nic
nic_id = vm.network_profile.network_interfaces[0].id
tmp = nic_id.split('/')
if tmp[-2] != 'networkInterfaces':
raise RuntimeError('could not parse network interface id')
nic_name = tmp[-1]
nic = network_client.network_interfaces.get(
resource_group_name=rfs.resource_group,
network_interface_name=nic_name,
)
# get public ip
pip_id = nic.ip_configurations[0].public_ip_address.id
tmp = pip_id.split('/')
if tmp[-2] != 'publicIPAddresses':
raise RuntimeError('could not parse public ip address id')
pip_name = tmp[-1]
pip = network_client.public_ip_addresses.get(
resource_group_name=rfs.resource_group,
public_ip_address_name=pip_name,
)
return (nic, pip)
def _get_resource_names_from_virtual_machine(
compute_client, network_client, rfs, vm, nic=None, pip=None):
# type: (azure.mgmt.compute.ComputeManagementClient,
@@ -1530,8 +1494,8 @@ def stat_storage_cluster(
for status in disk.statuses:
diskstates.append(status.code)
# get nic/pip connected to vm
nic, pip = _get_nic_and_pip_from_virtual_machine(
network_client, rfs, vm)
nic, pip = resource.get_nic_and_pip_from_virtual_machine(
network_client, rfs.resource_group, vm)
# get resource names (pass cached data to prevent another lookup)
_, _, subnet, vnet, nsg, slb = \
_get_resource_names_from_virtual_machine(
@@ -1589,6 +1553,8 @@ def stat_storage_cluster(
vm.instance_view.platform_update_domain,
vm.instance_view.platform_fault_domain),
'fqdn': pip.dns_settings.fqdn,
'static_ip': pip.public_ip_allocation_method ==
networkmodels.IPAllocationMethod.static,
'public_ip_address': pip.ip_address,
'private_ip_address': nic.ip_configurations[0].private_ip_address,
'admin_username': vm.os_profile.admin_username,
@@ -1648,7 +1614,8 @@ def _get_ssh_info(
raise
# get pip connected to vm
if pip is None:
_, pip = _get_nic_and_pip_from_virtual_machine(network_client, rfs, vm)
_, pip = resource.get_nic_and_pip_from_virtual_machine(
network_client, rfs.resource_group, vm)
# connect to vm
ssh_priv_key = pathlib.Path(
rfs.storage_cluster.ssh.generated_file_export_path, _SSH_KEY_PREFIX)

View file

@@ -139,3 +139,39 @@ def create_virtual_network_and_subnet(
virtual_network.address_space.address_prefixes,
vnet_settings.subnet_name, subnet.address_prefix))
return (virtual_network, subnet)
def get_nic_and_pip_from_virtual_machine(network_client, resource_group, vm):
# type: (azure.mgmt.network.NetworkManagementClient, str,
# computemodels.VirtualMachine) ->
# Tuple[networkmodels.NetworkInterface,
# networkmodels.PublicIPAddress]
"""Get network interface and public ip from a virtual machine
:param azure.mgmt.network.NetworkManagementClient network_client:
network client
:param str resource_group: resource group name
:param vm computemodels.VirtualMachine: vm
:rtype: tuple
:return: (nic, pip)
"""
# get nic
nic_id = vm.network_profile.network_interfaces[0].id
tmp = nic_id.split('/')
if tmp[-2] != 'networkInterfaces':
raise RuntimeError('could not parse network interface id')
nic_name = tmp[-1]
nic = network_client.network_interfaces.get(
resource_group_name=resource_group,
network_interface_name=nic_name,
)
# get public ip
pip_id = nic.ip_configurations[0].public_ip_address.id
tmp = pip_id.split('/')
if tmp[-2] != 'publicIPAddresses':
raise RuntimeError('could not parse public ip address id')
pip_name = tmp[-1]
pip = network_client.public_ip_addresses.get(
resource_group_name=resource_group,
public_ip_address_name=pip_name,
)
return (nic, pip)
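Since get_nic_and_pip_from_virtual_machine now lives in the shared resource module and takes a plain resource group name, both the remotefs and fleet code paths can call it the same way. A hedged usage sketch, assuming pre-initialized compute and network management clients; the resource group and VM names below are placeholders, not values from the commit:

```python
from convoy import resource

def first_fileserver_endpoint(compute_client, network_client):
    # look up the first file server VM of a hypothetical storage cluster
    vm = compute_client.virtual_machines.get(
        resource_group_name='my-remotefs-rg',
        vm_name='mystoragecluster-vm0',
    )
    _, pip = resource.get_nic_and_pip_from_virtual_machine(
        network_client, 'my-remotefs-rg', vm)
    # simplified fallback: static allocations expose ip_address, otherwise
    # use the fqdn (mirrors the allocation-method check in _add_pool)
    return pip.ip_address or pip.dns_settings.fqdn
```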

View file

@@ -599,11 +599,13 @@ def raw_credentials(config, omit_keyvault):
return conf
def _aad_credentials(conf, default_endpoint=None):
def _aad_credentials(
conf, default_endpoint=None, default_token_cache_file=None):
# type: (dict, str) -> AADSettings
"""Retrieve AAD Settings
:param dict config: configuration object
:param str default_endpoint: default endpoint
:param str default_token_cache_file: default token cache file
:rtype: AADSettings
:return: AAD settings
"""
@@ -626,7 +628,7 @@ def _aad_credentials(conf, default_endpoint=None):
if token_cache_enabled:
token_cache_file = _kv_read_checked(
conf['aad']['token_cache'], 'filename',
'.batch_shipyard_aad_management_token.json')
default_token_cache_file)
else:
token_cache_file = None
return AADSettings(
@@ -669,7 +671,13 @@ def credentials_keyvault(config):
keyvault_credentials_secret_id = _kv_read_checked(
conf, 'credentials_secret_id')
return KeyVaultCredentialsSettings(
aad=_aad_credentials(conf, default_endpoint='https://vault.azure.net'),
aad=_aad_credentials(
conf,
default_endpoint='https://vault.azure.net',
default_token_cache_file=(
'.batch_shipyard_aad_keyvault_token.json'
),
),
keyvault_uri=keyvault_uri,
keyvault_credentials_secret_id=keyvault_credentials_secret_id,
)
@@ -689,7 +697,12 @@ def credentials_management(config):
subscription_id = _kv_read_checked(conf, 'subscription_id')
return ManagementCredentialsSettings(
aad=_aad_credentials(
conf, default_endpoint='https://management.core.windows.net/'),
conf,
default_endpoint='https://management.core.windows.net/',
default_token_cache_file=(
'.batch_shipyard_aad_management_token.json'
),
),
subscription_id=subscription_id,
)
@@ -716,7 +729,12 @@ def credentials_batch(config):
location = account_service_url.split('.')[1]
return BatchCredentialsSettings(
aad=_aad_credentials(
conf, default_endpoint='https://batch.core.windows.net/'),
conf,
default_endpoint='https://batch.core.windows.net/',
default_token_cache_file=(
'.batch_shipyard_aad_batch_token.json'
),
),
account=conf['account'],
account_key=account_key,
account_service_url=conf['account_service_url'],

View file

@@ -44,9 +44,10 @@ p2penabled=0
prefix=
privatereg=
sku=
sc_id=0
version=
while getopts "h?ab:de:fg:no:p:r:s:t:v:wx:" opt; do
while getopts "h?ab:de:fg:nm:o:p:r:s:t:v:wx:" opt; do
case "$opt" in
h|\?)
echo "shipyard_nodeprep.sh parameters"
@@ -57,6 +58,7 @@ while getopts "h?ab:de:fg:no:p:r:s:t:v:wx:" opt; do
echo "-e [thumbprint] encrypted credentials with cert"
echo "-f set up glusterfs cluster"
echo "-g [nv-series:driver file:nvidia docker pkg] gpu support"
echo "-m [scid] mount storage cluster"
echo "-n optimize network TCP settings"
echo "-o [offer] VM offer"
echo "-p [prefix] storage container prefix"
@@ -87,6 +89,9 @@ while getopts "h?ab:de:fg:no:p:r:s:t:v:wx:" opt; do
g)
gpu=$OPTARG
;;
m)
sc_id=$OPTARG
;;
n)
networkopt=1
;;
@@ -583,6 +588,37 @@ if [ ! -z ${DOCKER_LOGIN_USERNAME+x} ]; then
docker login -u $DOCKER_LOGIN_USERNAME -p $DOCKER_LOGIN_PASSWORD $DOCKER_LOGIN_SERVER
fi
# mount any storage clusters
if [ ! -z $sc_id ]; then
mountpoint=$AZ_BATCH_NODE_SHARED_DIR/$sc_id
echo "Creating host directory for storage cluster $sc_id at $mountpoint"
mkdir -p $mountpoint
chmod 775 $mountpoint
echo "Adding $mountpoint to fstab"
echo "$SHIPYARD_STORAGE_CLUSTER_FSTAB" >> /etc/fstab
tail -n1 /etc/fstab
echo "Mounting $mountpoint"
START=$(date -u +"%s")
set +e
while :
do
mount $mountpoint
if [ $? -eq 0 ]; then
break
else
NOW=$(date -u +"%s")
DIFF=$((($NOW-$START)/60))
# fail after 5 minutes of attempts
if [ $DIFF -ge 5 ]; then
echo "Could not mount storage cluster $sc_id on: $mountpoint"
exit 1
fi
sleep 1
fi
done
set -e
fi
# touch node prep finished file to preserve idempotency
touch $nodeprepfinished
# touch cascade failed file, this will be removed once cascade is successful

View file

@@ -94,6 +94,7 @@ class CliContext(object):
skip_global_config=False, skip_pool_config=True, fs_storage=True)
self.resource_client, self.compute_client, self.network_client, \
_, _ = convoy.clients.create_arm_clients(self)
self.blob_client, _, _ = convoy.clients.create_storage_clients()
self._cleanup_after_initialize(
skip_global_config=False, skip_pool_config=True)