Slurm on Batch feature
- Package and use Slurm 18.08 instead of default from distro repo
- Slurm "master" contains separate controller and login nodes
- Integrate RemoteFS shared file system into Slurm cluster
- Auto feature tagging on Slurm nodes
- Support CentOS 7, Ubuntu 16.04, Ubuntu 18.04 Batch pools as Slurm node targets
- Unify login and Batch pools on cluster user based on login user
- Auto provision passwordless SSH user on compute nodes with login user context
- Add slurm cluster commands, including orchestrate command
- Add separate SSH for controller, login, nodes
- Add Slurm configuration doc
- Add Slurm guide
- Add Slurm recipe
- Update usage doc
- Remove deprecated MSI VM extension from monitoring and federation
- Fix pool nodes count on non-existent pool
- Refactor SSH info to allow offsets
- Add fs cluster orchestrate command
This commit is contained in:
Parent: d33f06457d
Commit: 314037f76f
|
@ -108,3 +108,7 @@ credentials:
|
|||
admin:
|
||||
username: grafana_username
|
||||
password: grafana_user_password
|
||||
# slurm credentials
|
||||
slurm:
|
||||
db_password: db_password
|
||||
# TODO munge key
|
||||
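The hunk above adds a slurm credentials block with a db_password entry. As a minimal standalone sketch (not the Shipyard code path), this is how such a value can be read from an already-loaded credentials mapping; the key names mirror the YAML above and the credentials_slurm() reader added later in this commit.

# Minimal sketch only: read the new slurm credentials block from a
# loaded credentials mapping. Key names come from the hunk above.
config = {
    'credentials': {
        'slurm': {
            'db_password': 'db_password',
        },
    },
}

def credentials_slurm_db_password(config):
    """Return the slurmdbd database password, or None if unset."""
    creds = config.get('credentials') or {}
    slurm = creds.get('slurm') or {}
    return slurm.get('db_password')

print(credentials_slurm_db_password(config))  # -> 'db_password'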
@ -0,0 +1,111 @@
|
|||
slurm:
|
||||
storage_account_settings: mystorageaccount
|
||||
location: <Azure region, e.g., eastus>
|
||||
resource_group: my-slurm-rg
|
||||
cluster_id: slurm
|
||||
controller:
|
||||
ssh:
|
||||
username: shipyard
|
||||
ssh_public_key: /path/to/rsa/publickey.pub
|
||||
ssh_public_key_data: ssh-rsa ...
|
||||
ssh_private_key: /path/to/rsa/privatekey
|
||||
generated_file_export_path: null
|
||||
public_ip:
|
||||
enabled: true
|
||||
static: false
|
||||
virtual_network:
|
||||
name: myvnet
|
||||
resource_group: my-vnet-resource-group
|
||||
existing_ok: false
|
||||
address_space: 10.0.0.0/16
|
||||
subnet:
|
||||
name: my-slurm-controller-subnet
|
||||
address_prefix: 10.0.1.0/24
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
custom_inbound_rules:
|
||||
myrule:
|
||||
destination_port_range: 5000-5001
|
||||
protocol: '*'
|
||||
source_address_prefix:
|
||||
- 1.2.3.4
|
||||
- 5.6.7.0/24
|
||||
vm_size: STANDARD_D2_V2
|
||||
vm_count: 2
|
||||
accelerated_networking: false
|
||||
additional_prep_script: /path/to/some/script-controller.sh
|
||||
login:
|
||||
ssh:
|
||||
username: shipyard
|
||||
ssh_public_key: /path/to/rsa/publickey.pub
|
||||
ssh_public_key_data: ssh-rsa ...
|
||||
ssh_private_key: /path/to/rsa/privatekey
|
||||
generated_file_export_path: null
|
||||
public_ip:
|
||||
enabled: true
|
||||
static: false
|
||||
virtual_network:
|
||||
name: myvnet
|
||||
resource_group: my-vnet-resource-group
|
||||
existing_ok: false
|
||||
address_space: 10.0.0.0/16
|
||||
subnet:
|
||||
name: my-slurm-login-subnet
|
||||
address_prefix: 10.0.2.0/24
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
custom_inbound_rules:
|
||||
myrule:
|
||||
destination_port_range: 5000-5001
|
||||
protocol: '*'
|
||||
source_address_prefix:
|
||||
- 1.2.3.4
|
||||
- 5.6.7.0/24
|
||||
vm_size: STANDARD_D4_V2
|
||||
vm_count: 1
|
||||
accelerated_networking: false
|
||||
additional_prep_script: /path/to/some/script-login.sh
|
||||
shared_data_volumes:
|
||||
nfs_server:
|
||||
mount_path: /shared
|
||||
store_slurmctld_state: true
|
||||
slurm_options:
|
||||
idle_reclaim_time: 00:15:00
|
||||
elastic_partitions:
|
||||
partition_1:
|
||||
batch_pools:
|
||||
mypool1:
|
||||
account_service_url: https://...
|
||||
compute_node_type: dedicated
|
||||
max_compute_nodes: 32
|
||||
weight: 0
|
||||
features:
|
||||
- arbitrary_constraint_1
|
||||
reclaim_exclude_num_nodes: 8
|
||||
mypool2:
|
||||
account_service_url: https://...
|
||||
compute_node_type: low_priority
|
||||
max_compute_nodes: 128
|
||||
weight: 1
|
||||
features:
|
||||
- arbitrary_constraint_2
|
||||
reclaim_exclude_num_nodes: 0
|
||||
max_runtime_limit: null
|
||||
default: true
|
||||
partition_2:
|
||||
batch_pools:
|
||||
mypool3:
|
||||
account_service_url: https://...
|
||||
compute_node_type: low_priority
|
||||
max_compute_nodes: 256
|
||||
weight: 2
|
||||
features: []
|
||||
reclaim_exclude_num_nodes: 0
|
||||
max_runtime_limit: 1.12:00:00
|
||||
default: false
|
||||
unmanaged_partitions:
|
||||
- partition: 'PartitionName=onprem Nodes=onprem-[0-31] Default=No MaxTime=INFINITE State=UP'
|
||||
nodes:
|
||||
- 'NodeName=onprem-[0-31] CPUs=512 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 RealMemory=512128 State=UNKNOWN'
|
|
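The new configuration file above declares elastic partitions backed by Batch pools. As an illustration only (this code is not part of the commit), the sketch below folds one partition entry into a slurm.conf-style PartitionName line and totals its node budget; the key names are taken from the sample above, the PartitionName/MaxTime/State syntax matches the unmanaged_partitions example, and the node name range is a placeholder since real node naming is handled by the cluster orchestration.

# Illustrative sketch only: derive a slurm.conf partition line and a node
# budget from the elastic_partitions sample above. The real generation
# logic lives in slurm/slurm.py and the bootstrap scripts, not shown here.
partition_1 = {
    'batch_pools': {
        'mypool1': {'max_compute_nodes': 32, 'weight': 0},
        'mypool2': {'max_compute_nodes': 128, 'weight': 1},
    },
    'max_runtime_limit': None,   # null in the sample -> no MaxTime limit
    'default': True,
}

def partition_line(name, part):
    """Build a slurm.conf PartitionName entry for one elastic partition."""
    max_nodes = sum(
        p['max_compute_nodes'] for p in part['batch_pools'].values())
    max_time = part['max_runtime_limit'] or 'INFINITE'
    default = 'YES' if part['default'] else 'NO'
    # placeholder node range; actual node names are assigned at provision time
    return ('PartitionName={} Nodes={}-[0-{}] Default={} '
            'MaxTime={} State=UP').format(
                name, name, max_nodes - 1, default, max_time)

print(partition_line('partition_1', partition_1))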
@ -2798,7 +2798,10 @@ def get_node_counts(batch_client, config, pool_id=None):
|
|||
account_list_pool_node_counts_options=batchmodels.
|
||||
AccountListPoolNodeCountsOptions(
|
||||
filter='poolId eq \'{}\''.format(pool_id)))
|
||||
pc = list(pc)[0]
|
||||
try:
|
||||
pc = list(pc)[0]
|
||||
except IndexError:
|
||||
raise RuntimeError('pool {} does not exist'.format(pool_id))
|
||||
except batchmodels.BatchErrorException as ex:
|
||||
if 'pool does not exist' in ex.message.value:
|
||||
logger.error('{} pool does not exist'.format(pool_id))
|
||||
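The hunk above guards the pool node count lookup so that a non-existent pool surfaces a clear error. A standalone sketch of the same guard pattern, without the Batch SDK, is below; the real code applies it to the Batch account's pool node counts listing filtered by pool id.

# Standalone sketch of the guard added above: take the first element of a
# possibly empty listing and convert an empty result into a clear error
# instead of an unhandled IndexError.
def first_or_raise(results, pool_id):
    try:
        return list(results)[0]
    except IndexError:
        raise RuntimeError('pool {} does not exist'.format(pool_id))

print(first_or_raise(iter([{'pool_id': 'mypool', 'dedicated': 2}]), 'mypool'))
# first_or_raise(iter([]), 'missing')  # would raise RuntimeError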
@ -55,6 +55,9 @@ _SSH_KEY_PREFIX = 'id_rsa_shipyard'
|
|||
_REMOTEFS_SSH_KEY_PREFIX = '{}_remotefs'.format(_SSH_KEY_PREFIX)
|
||||
_MONITORING_SSH_KEY_PREFIX = '{}_monitoring'.format(_SSH_KEY_PREFIX)
|
||||
_FEDERATION_SSH_KEY_PREFIX = '{}_federation'.format(_SSH_KEY_PREFIX)
|
||||
_SLURM_CONTROLLER_SSH_KEY_PREFIX = '{}_slurm_controller'.format(
|
||||
_SSH_KEY_PREFIX)
|
||||
_SLURM_LOGIN_SSH_KEY_PREFIX = '{}_slurm_login'.format(_SSH_KEY_PREFIX)
|
||||
# named tuples
|
||||
PfxSettings = collections.namedtuple(
|
||||
'PfxSettings', [
|
||||
|
@ -99,6 +102,19 @@ def get_federation_ssh_key_prefix():
|
|||
return _FEDERATION_SSH_KEY_PREFIX
|
||||
|
||||
|
||||
def get_slurm_ssh_key_prefix(kind):
|
||||
# type: (str) -> str
|
||||
"""Get slurm SSH key prefix
|
||||
:param str kind: kind
|
||||
:rtype: str
|
||||
:return: ssh key prefix for slurm
|
||||
"""
|
||||
if kind == 'controller':
|
||||
return _SLURM_CONTROLLER_SSH_KEY_PREFIX
|
||||
else:
|
||||
return _SLURM_LOGIN_SSH_KEY_PREFIX
|
||||
|
||||
|
||||
def generate_rdp_password():
|
||||
# type: (None) -> str
|
||||
"""Generate an RDP password
|
||||
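The constants and helper above give the Slurm controller and login nodes their own SSH key prefixes. The standalone sketch below reproduces that naming convention from the hunk (the real helper is crypto.get_slurm_ssh_key_prefix and is not imported here).

# Standalone reproduction of the naming convention in the hunk above.
_SSH_KEY_PREFIX = 'id_rsa_shipyard'
_SLURM_CONTROLLER_SSH_KEY_PREFIX = '{}_slurm_controller'.format(
    _SSH_KEY_PREFIX)
_SLURM_LOGIN_SSH_KEY_PREFIX = '{}_slurm_login'.format(_SSH_KEY_PREFIX)

def get_slurm_ssh_key_prefix(kind):
    """Return the SSH key file prefix for a 'controller' or 'login' VM."""
    if kind == 'controller':
        return _SLURM_CONTROLLER_SSH_KEY_PREFIX
    return _SLURM_LOGIN_SSH_KEY_PREFIX

print(get_slurm_ssh_key_prefix('controller'))  # id_rsa_shipyard_slurm_controller
print(get_slurm_ssh_key_prefix('login'))       # id_rsa_shipyard_slurm_login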
@ -80,7 +80,7 @@ def _create_virtual_machine_extension(
|
|||
vm_ext_name = settings.generate_virtual_machine_extension_name(
|
||||
vm_resource, offset)
|
||||
# try to get storage account resource group
|
||||
ssel = settings.federation_storage_account_settings(config)
|
||||
ssel = settings.other_storage_account_settings(config, 'federation')
|
||||
rg = settings.credentials_storage(config, ssel).resource_group
|
||||
# construct bootstrap command
|
||||
cmd = './{bsf}{a}{log}{p}{r}{s}{v}'.format(
|
||||
|
@ -348,18 +348,10 @@ def create_federation_proxy(
|
|||
)
|
||||
fqdn = pip.dns_settings.fqdn
|
||||
ipinfo = 'fqdn={} public_ip_address={}'.format(fqdn, pip.ip_address)
|
||||
# install msi vm extension
|
||||
async_ops['vmext'] = {}
|
||||
async_ops['vmext'][0] = resource.AsyncOperation(
|
||||
functools.partial(
|
||||
resource.create_msi_virtual_machine_extension, compute_client, fs,
|
||||
vms[0].name, 0, settings.verbose(config)),
|
||||
max_retries=0,
|
||||
)
|
||||
logger.debug('waiting for virtual machine msi extensions to provision')
|
||||
for offset in async_ops['vmext']:
|
||||
async_ops['vmext'][offset].result()
|
||||
# install vm extension
|
||||
async_ops['vmext'] = {}
|
||||
async_ops['vmext'][0] = resource.AsyncOperation(
|
||||
functools.partial(
|
||||
_create_virtual_machine_extension, compute_client, config, fs,
|
||||
|
|
convoy/fleet.py
|
@ -54,6 +54,7 @@ from . import monitor
|
|||
from . import remotefs
|
||||
from . import resource
|
||||
from . import settings
|
||||
from . import slurm
|
||||
from . import storage
|
||||
from . import util
|
||||
from .version import __version__
|
||||
|
@ -296,6 +297,39 @@ _FEDERATIONSERVICES_FILE = (
|
|||
_ALL_FEDERATION_FILES = [
|
||||
_FEDERATIONPREP_FILE, _FEDERATIONSERVICES_FILE,
|
||||
]
|
||||
_SLURMMASTERPREP_FILE = (
|
||||
'shipyard_slurm_master_bootstrap.sh',
|
||||
pathlib.Path(_ROOT_PATH, 'scripts/shipyard_slurm_master_bootstrap.sh')
|
||||
)
|
||||
_SLURMCOMPUTENODEPREP_FILE = (
|
||||
'shipyard_slurm_computenode_nodeprep.sh',
|
||||
pathlib.Path(_ROOT_PATH, 'scripts/shipyard_slurm_computenode_nodeprep.sh')
|
||||
)
|
||||
_SLURMPY_FILE = (
|
||||
'slurm.py',
|
||||
pathlib.Path(_ROOT_PATH, 'slurm/slurm.py')
|
||||
)
|
||||
_SLURMREQ_FILE = (
|
||||
'requirements.txt',
|
||||
pathlib.Path(_ROOT_PATH, 'slurm/requirements.txt')
|
||||
)
|
||||
_SLURMCONF_FILE = (
|
||||
'slurm.conf',
|
||||
pathlib.Path(_ROOT_PATH, 'slurm/slurm.conf')
|
||||
)
|
||||
_SLURMDBDCONF_FILE = (
|
||||
'slurmdbd.conf',
|
||||
pathlib.Path(_ROOT_PATH, 'slurm/slurmdbd.conf')
|
||||
)
|
||||
_SLURMDBSQL_FILE = (
|
||||
'slurmdb.sql',
|
||||
pathlib.Path(_ROOT_PATH, 'slurm/slurmdb.sql')
|
||||
)
|
||||
_CONFIGURABLE_SLURM_FILES = {
|
||||
'slurm': _SLURMCONF_FILE,
|
||||
'slurmdbd': _SLURMDBDCONF_FILE,
|
||||
'slurmdbsql': _SLURMDBSQL_FILE,
|
||||
}
|
||||
|
||||
|
||||
def initialize_globals(verbose):
|
||||
|
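The constants above register the Slurm bootstrap scripts and configuration templates as (blob name, local path) tuples, with _CONFIGURABLE_SLURM_FILES mapping template keys to the files that get specialized per cluster. The sketch below only shows the shape of that lookup; the root path is an assumption for the sketch, and the upload/rendering mechanics live elsewhere in fleet.py and slurm.py.

# Sketch of the (blob name, local path) tuple convention used above.
import pathlib

_ROOT_PATH = pathlib.Path('.')  # assumption for this sketch
_SLURMCONF_FILE = ('slurm.conf', pathlib.Path(_ROOT_PATH, 'slurm/slurm.conf'))
_SLURMDBDCONF_FILE = (
    'slurmdbd.conf', pathlib.Path(_ROOT_PATH, 'slurm/slurmdbd.conf'))
_CONFIGURABLE_SLURM_FILES = {
    'slurm': _SLURMCONF_FILE,
    'slurmdbd': _SLURMDBDCONF_FILE,
}

for key, (blob_name, local_path) in _CONFIGURABLE_SLURM_FILES.items():
    print(key, blob_name, local_path)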
@ -771,9 +805,6 @@ def _create_storage_cluster_mount_args(
|
|||
:rtype: tuple
|
||||
:return: (fstab mount, storage cluster arg)
|
||||
"""
|
||||
fstab_mount = None
|
||||
sc_arg = None
|
||||
ba, _ = batch.get_batch_account(batch_mgmt_client, config)
|
||||
# check for vnet/subnet presence
|
||||
if util.is_none_or_empty(subnet_id):
|
||||
raise RuntimeError(
|
||||
|
@ -782,15 +813,9 @@ def _create_storage_cluster_mount_args(
|
|||
# get remotefs settings
|
||||
rfs = settings.remotefs_settings(config, sc_id)
|
||||
sc = rfs.storage_cluster
|
||||
# iterate through shared data volumes and fine storage clusters
|
||||
sdv = settings.global_resources_shared_data_volumes(config)
|
||||
if (sc_id not in sdv or
|
||||
not settings.is_shared_data_volume_storage_cluster(
|
||||
sdv, sc_id)):
|
||||
raise RuntimeError(
|
||||
'No storage cluster {} found in configuration'.format(sc_id))
|
||||
vnet_subid, vnet_rg, _, vnet_name, subnet_name = _explode_arm_subnet_id(
|
||||
subnet_id)
|
||||
# perform checks
|
||||
vnet_subid, vnet_rg, _, vnet_name, subnet_name = \
|
||||
util.explode_arm_subnet_id(subnet_id)
|
||||
# check for same vnet name
|
||||
if vnet_name.lower() != sc.virtual_network.name.lower():
|
||||
raise RuntimeError(
|
||||
|
@ -804,6 +829,7 @@ def _create_storage_cluster_mount_args(
|
|||
'{} with pool virtual network in resource group {}'.format(
|
||||
sc_id, sc.virtual_network.resource_group, vnet_rg))
|
||||
# cross check vnet subscription id
|
||||
ba, _ = batch.get_batch_account(batch_mgmt_client, config)
|
||||
_ba_tmp = ba.id.lower().split('/')
|
||||
if vnet_subid.lower() != _ba_tmp[2]:
|
||||
raise RuntimeError(
|
||||
|
@ -811,152 +837,12 @@ def _create_storage_cluster_mount_args(
|
|||
'{} with pool virtual network in subscription {}'.format(
|
||||
sc_id, vnet_subid, _ba_tmp[2]))
|
||||
del _ba_tmp
|
||||
# get vm count
|
||||
if sc.vm_count < 1:
|
||||
raise RuntimeError(
|
||||
'storage cluster {} vm_count {} is invalid'.format(
|
||||
sc_id, sc.vm_count))
|
||||
# get fileserver type
|
||||
if sc.file_server.type == 'nfs':
|
||||
# query first vm for info
|
||||
vm_name = settings.generate_virtual_machine_name(sc, 0)
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=sc.resource_group,
|
||||
vm_name=vm_name,
|
||||
)
|
||||
nic = resource.get_nic_from_virtual_machine(
|
||||
network_client, sc.resource_group, vm)
|
||||
# get private ip of vm
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
# construct mount options
|
||||
mo = '_netdev,noauto,nfsvers=4,intr'
|
||||
amo = settings.shared_data_volume_mount_options(sdv, sc_id)
|
||||
if util.is_not_empty(amo):
|
||||
if 'udp' in mo:
|
||||
raise RuntimeError(
|
||||
('udp cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if 'auto' in mo:
|
||||
raise RuntimeError(
|
||||
('auto cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('nfsvers=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('nfsvers cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('port=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('port cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
mo = ','.join((mo, ','.join(amo)))
|
||||
# construct mount string for fstab
|
||||
fstab_mount = (
|
||||
'{remoteip}:{srcpath} {hmp}/{scid} '
|
||||
'{fstype} {mo} 0 2').format(
|
||||
remoteip=remote_ip,
|
||||
srcpath=sc.file_server.mountpoint,
|
||||
hmp=settings.get_host_mounts_path(False),
|
||||
scid=sc_id,
|
||||
fstype=sc.file_server.type,
|
||||
mo=mo)
|
||||
elif sc.file_server.type == 'glusterfs':
|
||||
# walk vms and find non-overlapping ud/fds
|
||||
primary_ip = None
|
||||
primary_ud = None
|
||||
primary_fd = None
|
||||
backup_ip = None
|
||||
backup_ud = None
|
||||
backup_fd = None
|
||||
vms = {}
|
||||
# first pass, attempt to populate all ip, ud/fd
|
||||
for i in range(sc.vm_count):
|
||||
vm_name = settings.generate_virtual_machine_name(sc, i)
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=sc.resource_group,
|
||||
vm_name=vm_name,
|
||||
expand=compute_client.virtual_machines.models.
|
||||
InstanceViewTypes.instance_view,
|
||||
)
|
||||
nic = resource.get_nic_from_virtual_machine(
|
||||
network_client, sc.resource_group, vm)
|
||||
vms[i] = (vm, nic)
|
||||
# get private ip and ud/fd of vm
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
ud = vm.instance_view.platform_update_domain
|
||||
fd = vm.instance_view.platform_fault_domain
|
||||
if primary_ip is None:
|
||||
primary_ip = remote_ip
|
||||
primary_ud = ud
|
||||
primary_fd = fd
|
||||
if backup_ip is None:
|
||||
if (primary_ip == backup_ip or primary_ud == ud or
|
||||
primary_fd == fd):
|
||||
continue
|
||||
backup_ip = remote_ip
|
||||
backup_ud = ud
|
||||
backup_fd = fd
|
||||
# second pass, fill in with at least non-overlapping update domains
|
||||
if backup_ip is None:
|
||||
for i in range(sc.vm_count):
|
||||
vm, nic = vms[i]
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
ud = vm.instance_view.platform_update_domain
|
||||
fd = vm.instance_view.platform_fault_domain
|
||||
if primary_ud != ud:
|
||||
backup_ip = remote_ip
|
||||
backup_ud = ud
|
||||
backup_fd = fd
|
||||
break
|
||||
if primary_ip is None or backup_ip is None:
|
||||
raise RuntimeError(
|
||||
'Could not find either a primary ip {} or backup ip {} for '
|
||||
'glusterfs client mount'.format(primary_ip, backup_ip))
|
||||
logger.debug('primary ip/ud/fd={} backup ip/ud/fd={}'.format(
|
||||
(primary_ip, primary_ud, primary_fd),
|
||||
(backup_ip, backup_ud, backup_fd)))
|
||||
# construct mount options
|
||||
mo = '_netdev,noauto,transport=tcp,backupvolfile-server={}'.format(
|
||||
backup_ip)
|
||||
amo = settings.shared_data_volume_mount_options(sdv, sc_id)
|
||||
if util.is_not_empty(amo):
|
||||
if 'auto' in mo:
|
||||
raise RuntimeError(
|
||||
('auto cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('backupvolfile-server=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('backupvolfile-server cannot be specified as a mount '
|
||||
'option for storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('transport=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('transport cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
mo = ','.join((mo, ','.join(amo)))
|
||||
# construct mount string for fstab, srcpath is the gluster volume
|
||||
fstab_mount = (
|
||||
'{remoteip}:/{srcpath} {hmp}/{scid} '
|
||||
'{fstype} {mo} 0 2').format(
|
||||
remoteip=primary_ip,
|
||||
srcpath=settings.get_file_server_glusterfs_volume_name(sc),
|
||||
hmp=settings.get_host_mounts_path(False),
|
||||
scid=sc_id,
|
||||
fstype=sc.file_server.type,
|
||||
mo=mo)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
('cannot handle file_server type {} for storage '
|
||||
'cluster {}').format(sc.file_server.type, sc_id))
|
||||
if util.is_none_or_empty(fstab_mount):
|
||||
raise RuntimeError(
|
||||
('Could not construct an fstab mount entry for storage '
|
||||
'cluster {}').format(sc_id))
|
||||
# construct sc_arg
|
||||
sc_arg = '{}:{}'.format(sc.file_server.type, sc_id)
|
||||
# log config
|
||||
if settings.verbose(config):
|
||||
logger.debug('storage cluster {} fstab mount: {}'.format(
|
||||
sc_id, fstab_mount))
|
||||
return (fstab_mount, sc_arg)
|
||||
# construct host mount path
|
||||
host_mount_path = '{}/{}'.format(
|
||||
settings.get_host_mounts_path(False), sc_id)
|
||||
# return fstab and sc arg
|
||||
return remotefs.create_storage_cluster_mount_args(
|
||||
compute_client, network_client, config, sc_id, host_mount_path)
|
||||
|
||||
|
||||
def _create_custom_linux_mount_args(config, mount_name):
|
||||
|
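The refactor above moves fstab construction out of fleet.py and into remotefs.create_storage_cluster_mount_args, passing an explicit host mount path. For reference, a standalone sketch of the NFS fstab entry built by the new helper is below; the format string is taken from the remotefs.py hunk later in this diff, the host mounts path is an assumption for the sketch, and all values are placeholders.

# Standalone sketch of the NFS fstab entry built by the new
# remotefs.create_storage_cluster_mount_args; values are placeholders.
host_mounts_path = '/mnt/batch/tasks/mounts'  # assumption for this sketch
sc_id = 'mystoragecluster'
host_mount_path = '{}/{}'.format(host_mounts_path, sc_id)

fstab_mount = (
    '{remoteip}:{srcpath} {hmp} '
    '{fstype} {mo} 0 0').format(
        remoteip='10.0.1.4',
        srcpath='/data',
        hmp=host_mount_path,
        fstype='nfs',
        mo='_netdev,noauto,nfsvers=4,intr')
print(fstab_mount)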
@ -1054,28 +940,6 @@ def _pick_node_agent_for_vm(batch_client, config, pool_settings):
|
|||
return (image_ref_to_use, sku_to_use.id)
|
||||
|
||||
|
||||
def _explode_arm_subnet_id(arm_subnet_id):
|
||||
# type: (str) -> Tuple[str, str, str, str, str]
|
||||
"""Parses components from ARM subnet id
|
||||
:param str arm_subnet_id: ARM subnet id
|
||||
:rtype: tuple
|
||||
:return: subid, rg, provider, vnet, subnet
|
||||
"""
|
||||
tmp = arm_subnet_id.split('/')
|
||||
try:
|
||||
subid = tmp[2]
|
||||
rg = tmp[4]
|
||||
provider = tmp[6]
|
||||
vnet = tmp[8]
|
||||
subnet = tmp[10]
|
||||
except IndexError:
|
||||
raise ValueError(
|
||||
'Error parsing arm_subnet_id. Make sure the virtual network '
|
||||
'resource id is correct and is postfixed with the '
|
||||
'/subnets/<subnet_id> portion.')
|
||||
return subid, rg, provider, vnet, subnet
|
||||
|
||||
|
||||
def _check_for_batch_aad(bc, rmsg):
|
||||
# type: (settings.BatchCredentialSettings, str) -> None
|
||||
"""Check for Batch AAD
|
||||
|
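The hunk above drops the module-private _explode_arm_subnet_id in favor of util.explode_arm_subnet_id. The standalone sketch below reproduces the parsing from the removed function: an ARM subnet resource id is split on '/' and its components are taken by position.

# Standalone reproduction of the parsing removed above (now provided by
# util.explode_arm_subnet_id).
def explode_arm_subnet_id(arm_subnet_id):
    tmp = arm_subnet_id.split('/')
    try:
        return tmp[2], tmp[4], tmp[6], tmp[8], tmp[10]
    except IndexError:
        raise ValueError(
            'Error parsing arm_subnet_id. Make sure the virtual network '
            'resource id is correct and is postfixed with the '
            '/subnets/<subnet_id> portion.')

subnet_id = (
    '/subscriptions/00000000-0000-0000-0000-000000000000'
    '/resourceGroups/my-rg/providers/Microsoft.Network'
    '/virtualNetworks/myvnet/subnets/mysubnet')
print(explode_arm_subnet_id(subnet_id))
# -> (subscription id, 'my-rg', 'Microsoft.Network', 'myvnet', 'mysubnet')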
@ -1116,7 +980,7 @@ def _pool_virtual_network_subnet_address_space_check(
|
|||
# get subnet object
|
||||
subnet_id = None
|
||||
if util.is_not_empty(pool_settings.virtual_network.arm_subnet_id):
|
||||
subnet_components = _explode_arm_subnet_id(
|
||||
subnet_components = util.explode_arm_subnet_id(
|
||||
pool_settings.virtual_network.arm_subnet_id)
|
||||
logger.debug(
|
||||
('arm subnet id breakdown: subid={} rg={} provider={} vnet={} '
|
||||
|
@ -3154,7 +3018,7 @@ def action_fs_cluster_ssh(
|
|||
'was specified')
|
||||
cardinal = 0
|
||||
if cardinal is not None and cardinal < 0:
|
||||
raise ValueError('invalid cardinal option value')
|
||||
raise ValueError('invalid cardinal option value')
|
||||
remotefs.ssh_storage_cluster(
|
||||
compute_client, network_client, config, storage_cluster_id,
|
||||
cardinal, hostname, tty, command)
|
||||
|
@ -3587,8 +3451,11 @@ def action_pool_user_del(batch_client, config):
|
|||
batch.del_ssh_user(batch_client, config)
|
||||
|
||||
|
||||
def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
|
||||
# type: (batchsc.BatchServiceClient, dict, int, str, bool, tuple) -> None
|
||||
def action_pool_ssh(
|
||||
batch_client, config, cardinal, nodeid, tty, command,
|
||||
ssh_username=None, ssh_private_key=None):
|
||||
# type: (batchsc.BatchServiceClient, dict, int, str, bool, tuple, str,
|
||||
# str) -> None
|
||||
"""Action: Pool Ssh
|
||||
:param azure.batch.batch_service_client.BatchServiceClient batch_client:
|
||||
batch client
|
||||
|
@ -3597,6 +3464,8 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
|
|||
:param str nodeid: node id
|
||||
:param bool tty: allocate pseudo-tty
|
||||
:param tuple command: command to execute
|
||||
:param str ssh_username: ssh username
|
||||
:param pathlib.Path ssh_private_key: ssh private key
|
||||
"""
|
||||
_check_batch_client(batch_client)
|
||||
if cardinal is not None and util.is_not_empty(nodeid):
|
||||
|
@ -3609,14 +3478,18 @@ def action_pool_ssh(batch_client, config, cardinal, nodeid, tty, command):
|
|||
if cardinal is not None and cardinal < 0:
|
||||
raise ValueError('invalid cardinal option value')
|
||||
pool = settings.pool_settings(config)
|
||||
ssh_private_key = pool.ssh.ssh_private_key
|
||||
if ssh_private_key is None:
|
||||
ssh_private_key = pathlib.Path(
|
||||
pool.ssh.generated_file_export_path, crypto.get_ssh_key_prefix())
|
||||
ssh_private_key = pool.ssh.ssh_private_key
|
||||
if ssh_private_key is None:
|
||||
ssh_private_key = pathlib.Path(
|
||||
pool.ssh.generated_file_export_path,
|
||||
crypto.get_ssh_key_prefix())
|
||||
if util.is_none_or_empty(ssh_username):
|
||||
ssh_username = pool.ssh.username
|
||||
ip, port = batch.get_remote_login_setting_for_node(
|
||||
batch_client, config, cardinal, nodeid)
|
||||
crypto.connect_or_exec_ssh_command(
|
||||
ip, port, ssh_private_key, pool.ssh.username, tty=tty,
|
||||
ip, port, ssh_private_key, ssh_username, tty=tty,
|
||||
command=command)
|
||||
|
||||
|
||||
|
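action_pool_ssh above gains optional ssh_username and ssh_private_key overrides so the Slurm commands can reuse it with the login user's identity. The sketch below shows one plausible reading of the fallback order (override first, then the pool's configured key, then the generated key path), using a placeholder pool settings dict instead of settings.pool_settings(config).

# Sketch of the override/fallback order added above, with placeholder
# pool SSH settings. 'id_rsa_shipyard' is the key prefix defined in crypto.py.
import pathlib

def resolve_ssh_identity(pool_ssh, ssh_username=None, ssh_private_key=None):
    """Return (username, private key path) per the fallback order above."""
    if ssh_private_key is None:
        ssh_private_key = pool_ssh.get('ssh_private_key')
    if ssh_private_key is None:
        ssh_private_key = pathlib.Path(
            pool_ssh.get('generated_file_export_path') or '.',
            'id_rsa_shipyard')
    if not ssh_username:
        ssh_username = pool_ssh['username']
    return ssh_username, ssh_private_key

pool_ssh = {'username': 'shipyard', 'ssh_private_key': None,
            'generated_file_export_path': '.'}
print(resolve_ssh_identity(pool_ssh))
print(resolve_ssh_identity(
    pool_ssh, ssh_username='slurmlogin',
    ssh_private_key=pathlib.Path('id_rsa_shipyard_slurm_login')))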
@ -3639,7 +3512,7 @@ def action_pool_rdp(batch_client, config, cardinal, nodeid, no_auto=False):
|
|||
'was specified')
|
||||
cardinal = 0
|
||||
if cardinal is not None and cardinal < 0:
|
||||
raise ValueError('invalid cardinal option value')
|
||||
raise ValueError('invalid cardinal option value')
|
||||
pool = settings.pool_settings(config)
|
||||
ip, port = batch.get_remote_login_setting_for_node(
|
||||
batch_client, config, cardinal, nodeid)
|
||||
|
@ -5051,3 +4924,167 @@ def action_fed_jobs_zap(blob_client, config, federation_id, unique_id):
|
|||
return
|
||||
storage.zap_unique_id_from_federation(
|
||||
blob_client, config, federation_id, unique_id)
|
||||
|
||||
|
||||
def action_slurm_ssh(
|
||||
compute_client, network_client, table_client, batch_client, config,
|
||||
tty, command, kind, offset, node_name):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient, dict,
|
||||
# bool, tuple, str, int, str) -> None
|
||||
"""Action: Slurm Ssh Controller
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param azure.mgmt.network.NetworkManagementClient network_client:
|
||||
network client
|
||||
:param dict config: configuration dict
|
||||
:param bool tty: allocate pseudo-tty
|
||||
:param tuple command: command
|
||||
:param str kind: kind
|
||||
:param int offset: offset
|
||||
:param str node_name: node name
|
||||
"""
|
||||
if util.is_none_or_empty(node_name):
|
||||
_check_compute_client(compute_client)
|
||||
_check_network_client(network_client)
|
||||
vm_res = settings.slurm_settings(config, kind)
|
||||
if offset is None:
|
||||
offset = 0
|
||||
else:
|
||||
offset = int(offset)
|
||||
if kind == 'login':
|
||||
cont_vm_count = settings.slurm_vm_count(config, 'controller')
|
||||
offset = cont_vm_count + offset
|
||||
resource.ssh_to_virtual_machine_resource(
|
||||
compute_client, network_client, vm_res,
|
||||
crypto.get_slurm_ssh_key_prefix(kind), tty, command, offset=offset)
|
||||
else:
|
||||
slurm_opts = settings.slurm_options_settings(config)
|
||||
# get host name to node id mapping
|
||||
node_id = storage.get_slurm_host_node_id(
|
||||
table_client, slurm_opts.cluster_id, node_name)
|
||||
if util.is_none_or_empty(node_id):
|
||||
raise RuntimeError(
|
||||
'No batch node id associated with Slurm node: {}'.format(
|
||||
node_name))
|
||||
ss_login = settings.slurm_settings(config, 'login')
|
||||
ssh_private_key = ss_login.ssh.ssh_private_key
|
||||
if ssh_private_key is None:
|
||||
ssh_private_key = pathlib.Path(
|
||||
ss_login.ssh.generated_file_export_path,
|
||||
crypto.get_slurm_ssh_key_prefix('login'))
|
||||
action_pool_ssh(
|
||||
batch_client, config, None, node_id, tty, command,
|
||||
ssh_username=ss_login.ssh.username,
|
||||
ssh_private_key=ssh_private_key)
|
||||
|
||||
|
||||
def action_slurm_cluster_create(
|
||||
auth_client, resource_client, compute_client, network_client,
|
||||
blob_client, table_client, queue_client, batch_client, config):
|
||||
# type: (azure.mgmt.authorization.AuthorizationManagementClient,
|
||||
# azure.mgmt.resource.resources.ResourceManagementClient,
|
||||
# azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient,
|
||||
# azure.storage.blob.BlockBlobService,
|
||||
# azure.cosmosdb.table.TableService,
|
||||
# azure.batch.batch_service_client.BatchServiceClient, dict) -> None
|
||||
"""Action: Slurm Cluster Create
|
||||
:param azure.mgmt.authorization.AuthorizationManagementClient auth_client:
|
||||
auth client
|
||||
:param azure.mgmt.resource.resources.ResourceManagementClient
|
||||
resource_client: resource client
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param azure.mgmt.network.NetworkManagementClient network_client:
|
||||
network client
|
||||
:param azure.storage.blob.BlockBlobService blob_client: blob client
|
||||
:param azure.cosmosdb.table.TableService table_client: table client
|
||||
:param azure.batch.batch_service_client.BatchServiceClient batch_client:
|
||||
batch client
|
||||
:param dict config: configuration dict
|
||||
"""
|
||||
_check_resource_client(resource_client)
|
||||
_check_compute_client(compute_client)
|
||||
_check_network_client(network_client)
|
||||
_check_batch_client(batch_client)
|
||||
# ensure aad creds are populated
|
||||
mgmt_aad = settings.credentials_management(config)
|
||||
if (util.is_none_or_empty(mgmt_aad.subscription_id) or
|
||||
util.is_none_or_empty(mgmt_aad.aad.authority_url)):
|
||||
raise ValueError('management aad credentials are invalid')
|
||||
slurm.create_slurm_controller(
|
||||
auth_client, resource_client, compute_client, network_client,
|
||||
blob_client, table_client, queue_client, batch_client, config,
|
||||
_RESOURCES_PATH, _SLURMMASTERPREP_FILE, _SLURMCOMPUTENODEPREP_FILE,
|
||||
_SLURMPY_FILE, _SLURMREQ_FILE, _CONFIGURABLE_SLURM_FILES)
|
||||
|
||||
|
||||
def action_slurm_cluster_status(compute_client, network_client, config):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient, dict) -> None
|
||||
"""Action: Slurm Cluster Status
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param azure.mgmt.network.NetworkManagementClient network_client:
|
||||
network client
|
||||
:param dict config: configuration dict
|
||||
"""
|
||||
_check_compute_client(compute_client)
|
||||
vm_res = settings.slurm_settings(config, 'controller')
|
||||
cont_vm_count = settings.slurm_vm_count(config, 'controller')
|
||||
i = 0
|
||||
while i < cont_vm_count:
|
||||
resource.stat_virtual_machine_resource(
|
||||
compute_client, network_client, config, vm_res, offset=i)
|
||||
i += 1
|
||||
vm_res = settings.slurm_settings(config, 'login')
|
||||
login_vm_count = settings.slurm_vm_count(config, 'login')
|
||||
i = 0
|
||||
while i < login_vm_count:
|
||||
resource.stat_virtual_machine_resource(
|
||||
compute_client, network_client, config, vm_res,
|
||||
offset=cont_vm_count + i)
|
||||
i += 1
|
||||
|
||||
|
||||
def action_slurm_cluster_destroy(
|
||||
resource_client, compute_client, network_client, blob_client,
|
||||
table_client, queue_client, config, delete_all_resources,
|
||||
delete_virtual_network, generate_from_prefix, wait):
|
||||
# type: (azure.mgmt.resource.resources.ResourceManagementClient,
|
||||
# azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient,
|
||||
# azure.storage.blob.BlockBlobService,
|
||||
# azure.cosmosdb.table.TableService,
|
||||
# azure.storage.queue.QueueService, dict, bool, bool,
|
||||
# bool, bool) -> None
|
||||
"""Action: Slurm Cluster Destroy
|
||||
:param azure.mgmt.resource.resources.ResourceManagementClient
|
||||
resource_client: resource client
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param azure.mgmt.network.NetworkManagementClient network_client:
|
||||
network client
|
||||
:param azure.storage.blob.BlockBlobService blob_client: blob client
|
||||
:param azure.cosmosdb.table.TableService table_client: table client
|
||||
:param azure.storage.queue.QueueService queue_client: queue client
|
||||
:param dict config: configuration dict
|
||||
:param bool delete_all_resources: delete all resources
|
||||
:param bool delete_virtual_network: delete virtual network
|
||||
:param bool generate_from_prefix: generate resources from hostname prefix
|
||||
:param bool wait: wait for deletion to complete
|
||||
"""
|
||||
_check_resource_client(resource_client)
|
||||
_check_compute_client(compute_client)
|
||||
_check_network_client(network_client)
|
||||
if (generate_from_prefix and
|
||||
(delete_all_resources or delete_virtual_network)):
|
||||
raise ValueError(
|
||||
'Cannot specify generate_from_prefix and a delete_* option')
|
||||
slurm.delete_slurm_controller(
|
||||
resource_client, compute_client, network_client, blob_client,
|
||||
table_client, queue_client, config,
|
||||
delete_virtual_network=delete_virtual_network,
|
||||
delete_resource_group=delete_all_resources,
|
||||
generate_from_prefix=generate_from_prefix, wait=wait)
|
||||
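action_slurm_ssh and action_slurm_cluster_status above address login VMs by offsetting past the controller VMs, since controller and login nodes share one VM naming sequence. A small sketch of that offset arithmetic follows; the counts are assumptions taken from the sample configuration earlier in this diff (controller vm_count 2, login vm_count 1).

# Sketch of the offset arithmetic used above: login VMs follow the
# controller VMs in the same naming sequence.
controller_vm_count = 2
login_vm_count = 1

def absolute_offset(kind, offset):
    """Map a per-kind offset to the absolute VM offset."""
    if kind == 'login':
        return controller_vm_count + offset
    return offset

print(absolute_offset('controller', 0))  # 0
print(absolute_offset('controller', 1))  # 1
print(absolute_offset('login', 0))       # 2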
@ -423,21 +423,11 @@ def create_monitoring_resource(
|
|||
async_ops['port80'] = resource.AsyncOperation(functools.partial(
|
||||
resource.add_inbound_network_security_rule, network_client, ms,
|
||||
'acme80', isr))
|
||||
# install msi vm extension
|
||||
async_ops['vmext'] = {}
|
||||
async_ops['vmext'][0] = resource.AsyncOperation(
|
||||
functools.partial(
|
||||
resource.create_msi_virtual_machine_extension, compute_client, ms,
|
||||
vms[0].name, 0, settings.verbose(config)),
|
||||
max_retries=0,
|
||||
)
|
||||
logger.debug('waiting for virtual machine msi extensions to provision')
|
||||
for offset in async_ops['vmext']:
|
||||
async_ops['vmext'][offset].result()
|
||||
# ensure port 80 rule is ready
|
||||
if servconf.lets_encrypt_enabled and ms.public_ip.enabled:
|
||||
async_ops['port80'].result()
|
||||
# install vm extension
|
||||
async_ops['vmext'] = {}
|
||||
async_ops['vmext'][0] = resource.AsyncOperation(
|
||||
functools.partial(
|
||||
_create_virtual_machine_extension, compute_client, config, ms,
|
||||
|
|
|
@ -52,6 +52,180 @@ logger = logging.getLogger(__name__)
|
|||
util.setup_logger(logger)
|
||||
|
||||
|
||||
def create_storage_cluster_mount_args(
|
||||
compute_client, network_client, config, sc_id, host_mount_path):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient,
|
||||
# dict, str, str) -> Tuple[str, str]
|
||||
"""Create storage cluster mount arguments
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param azure.mgmt.network.NetworkManagementClient network_client:
|
||||
network client
|
||||
:param dict config: configuration dict
|
||||
:param str sc_id: storage cluster id
|
||||
:param str host_mount_path: host mount path
|
||||
:rtype: tuple
|
||||
:return: (fstab mount, storage cluster arg)
|
||||
"""
|
||||
fstab_mount = None
|
||||
sc_arg = None
|
||||
# get remotefs settings
|
||||
rfs = settings.remotefs_settings(config, sc_id)
|
||||
sc = rfs.storage_cluster
|
||||
# iterate through shared data volumes and find storage clusters
|
||||
sdv = settings.global_resources_shared_data_volumes(config)
|
||||
if (sc_id not in sdv or
|
||||
not settings.is_shared_data_volume_storage_cluster(
|
||||
sdv, sc_id)):
|
||||
raise RuntimeError(
|
||||
'No storage cluster {} found in configuration'.format(sc_id))
|
||||
# get vm count
|
||||
if sc.vm_count < 1:
|
||||
raise RuntimeError(
|
||||
'storage cluster {} vm_count {} is invalid'.format(
|
||||
sc_id, sc.vm_count))
|
||||
# get fileserver type
|
||||
if sc.file_server.type == 'nfs':
|
||||
# query first vm for info
|
||||
vm_name = settings.generate_virtual_machine_name(sc, 0)
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=sc.resource_group,
|
||||
vm_name=vm_name,
|
||||
)
|
||||
nic = resource.get_nic_from_virtual_machine(
|
||||
network_client, sc.resource_group, vm)
|
||||
# get private ip of vm
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
# construct mount options
|
||||
mo = '_netdev,noauto,nfsvers=4,intr'
|
||||
amo = settings.shared_data_volume_mount_options(sdv, sc_id)
|
||||
if util.is_not_empty(amo):
|
||||
if 'udp' in mo:
|
||||
raise RuntimeError(
|
||||
('udp cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if 'auto' in mo:
|
||||
raise RuntimeError(
|
||||
('auto cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('nfsvers=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('nfsvers cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('port=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('port cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
mo = ','.join((mo, ','.join(amo)))
|
||||
# construct mount string for fstab
|
||||
fstab_mount = (
|
||||
'{remoteip}:{srcpath} {hmp} '
|
||||
'{fstype} {mo} 0 0').format(
|
||||
remoteip=remote_ip,
|
||||
srcpath=sc.file_server.mountpoint,
|
||||
hmp=host_mount_path,
|
||||
fstype=sc.file_server.type,
|
||||
mo=mo)
|
||||
elif sc.file_server.type == 'glusterfs':
|
||||
# walk vms and find non-overlapping ud/fds
|
||||
primary_ip = None
|
||||
primary_ud = None
|
||||
primary_fd = None
|
||||
backup_ip = None
|
||||
backup_ud = None
|
||||
backup_fd = None
|
||||
vms = {}
|
||||
# first pass, attempt to populate all ip, ud/fd
|
||||
for i in range(sc.vm_count):
|
||||
vm_name = settings.generate_virtual_machine_name(sc, i)
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=sc.resource_group,
|
||||
vm_name=vm_name,
|
||||
expand=compute_client.virtual_machines.models.
|
||||
InstanceViewTypes.instance_view,
|
||||
)
|
||||
nic = resource.get_nic_from_virtual_machine(
|
||||
network_client, sc.resource_group, vm)
|
||||
vms[i] = (vm, nic)
|
||||
# get private ip and ud/fd of vm
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
ud = vm.instance_view.platform_update_domain
|
||||
fd = vm.instance_view.platform_fault_domain
|
||||
if primary_ip is None:
|
||||
primary_ip = remote_ip
|
||||
primary_ud = ud
|
||||
primary_fd = fd
|
||||
if backup_ip is None:
|
||||
if (primary_ip == backup_ip or primary_ud == ud or
|
||||
primary_fd == fd):
|
||||
continue
|
||||
backup_ip = remote_ip
|
||||
backup_ud = ud
|
||||
backup_fd = fd
|
||||
# second pass, fill in with at least non-overlapping update domains
|
||||
if backup_ip is None:
|
||||
for i in range(sc.vm_count):
|
||||
vm, nic = vms[i]
|
||||
remote_ip = nic.ip_configurations[0].private_ip_address
|
||||
ud = vm.instance_view.platform_update_domain
|
||||
fd = vm.instance_view.platform_fault_domain
|
||||
if primary_ud != ud:
|
||||
backup_ip = remote_ip
|
||||
backup_ud = ud
|
||||
backup_fd = fd
|
||||
break
|
||||
if primary_ip is None or backup_ip is None:
|
||||
raise RuntimeError(
|
||||
'Could not find either a primary ip {} or backup ip {} for '
|
||||
'glusterfs client mount'.format(primary_ip, backup_ip))
|
||||
logger.debug('primary ip/ud/fd={} backup ip/ud/fd={}'.format(
|
||||
(primary_ip, primary_ud, primary_fd),
|
||||
(backup_ip, backup_ud, backup_fd)))
|
||||
# construct mount options
|
||||
mo = '_netdev,noauto,transport=tcp,backupvolfile-server={}'.format(
|
||||
backup_ip)
|
||||
amo = settings.shared_data_volume_mount_options(sdv, sc_id)
|
||||
if util.is_not_empty(amo):
|
||||
if 'auto' in mo:
|
||||
raise RuntimeError(
|
||||
('auto cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('backupvolfile-server=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('backupvolfile-server cannot be specified as a mount '
|
||||
'option for storage cluster {}').format(sc_id))
|
||||
if any([x.startswith('transport=') for x in amo]):
|
||||
raise RuntimeError(
|
||||
('transport cannot be specified as a mount option for '
|
||||
'storage cluster {}').format(sc_id))
|
||||
mo = ','.join((mo, ','.join(amo)))
|
||||
# construct mount string for fstab, srcpath is the gluster volume
|
||||
fstab_mount = (
|
||||
'{remoteip}:/{srcpath} {hmp} '
|
||||
'{fstype} {mo} 0 0').format(
|
||||
remoteip=primary_ip,
|
||||
srcpath=settings.get_file_server_glusterfs_volume_name(sc),
|
||||
hmp=host_mount_path,
|
||||
fstype=sc.file_server.type,
|
||||
mo=mo)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
('cannot handle file_server type {} for storage '
|
||||
'cluster {}').format(sc.file_server.type, sc_id))
|
||||
if util.is_none_or_empty(fstab_mount):
|
||||
raise RuntimeError(
|
||||
('Could not construct an fstab mount entry for storage '
|
||||
'cluster {}').format(sc_id))
|
||||
# construct sc_arg
|
||||
sc_arg = '{}:{}'.format(sc.file_server.type, sc_id)
|
||||
# log config
|
||||
if settings.verbose(config):
|
||||
logger.debug('storage cluster {} fstab mount: {}'.format(
|
||||
sc_id, fstab_mount))
|
||||
return (fstab_mount, sc_arg)
|
||||
|
||||
|
||||
def _create_managed_disk(compute_client, rfs, disk_name):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# settings.RemoteFsSettings, str) ->
|
||||
|
@ -444,52 +618,6 @@ def _create_virtual_machine_extension(
|
|||
)
|
||||
|
||||
|
||||
def _create_availability_set(compute_client, rfs):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# settings.RemoteFsSettings) ->
|
||||
# msrestazure.azure_operation.AzureOperationPoller
|
||||
"""Create an availability set
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param settings.RemoteFsSettings rfs: remote filesystem settings
|
||||
:rtype: msrestazure.azure_operation.AzureOperationPoller or None
|
||||
:return: msrestazure.azure_operation.AzureOperationPoller
|
||||
"""
|
||||
if rfs.storage_cluster.vm_count <= 1:
|
||||
logger.info('insufficient vm_count for availability set')
|
||||
return None
|
||||
if rfs.storage_cluster.zone is not None:
|
||||
logger.info('cannot create an availability set for zonal resource')
|
||||
return None
|
||||
as_name = settings.generate_availability_set_name(rfs.storage_cluster)
|
||||
# check and fail if as exists
|
||||
try:
|
||||
compute_client.availability_sets.get(
|
||||
resource_group_name=rfs.storage_cluster.resource_group,
|
||||
availability_set_name=as_name,
|
||||
)
|
||||
raise RuntimeError('availability set {} exists'.format(as_name))
|
||||
except msrestazure.azure_exceptions.CloudError as e:
|
||||
if e.status_code == 404:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
logger.debug('creating availability set: {}'.format(as_name))
|
||||
return compute_client.availability_sets.create_or_update(
|
||||
resource_group_name=rfs.storage_cluster.resource_group,
|
||||
availability_set_name=as_name,
|
||||
# user maximums ud, fd from settings due to region variability
|
||||
parameters=compute_client.virtual_machines.models.AvailabilitySet(
|
||||
location=rfs.storage_cluster.location,
|
||||
platform_update_domain_count=20,
|
||||
platform_fault_domain_count=rfs.storage_cluster.fault_domains,
|
||||
sku=compute_client.virtual_machines.models.Sku(
|
||||
name='Aligned',
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def create_storage_cluster(
|
||||
resource_client, compute_client, network_client, blob_client, config,
|
||||
sc_id, bootstrap_file, remotefs_files):
|
||||
|
@ -633,7 +761,9 @@ def create_storage_cluster(
|
|||
resource.create_network_interface, network_client,
|
||||
rfs.storage_cluster, subnet, nsg, private_ips, pips, i))
|
||||
# create availability set if vm_count > 1, this call is not async
|
||||
availset = _create_availability_set(compute_client, rfs)
|
||||
availset = resource.create_availability_set(
|
||||
compute_client, rfs.storage_cluster, rfs.storage_cluster.vm_count,
|
||||
fault_domains=rfs.storage_cluster.fault_domains)
|
||||
# wait for nics to be created
|
||||
logger.debug('waiting for network interfaces to provision')
|
||||
nics = {}
|
||||
|
@ -1257,24 +1387,6 @@ def expand_storage_cluster(
|
|||
return succeeded
|
||||
|
||||
|
||||
def _delete_availability_set(compute_client, rg_name, as_name):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient, str, str) ->
|
||||
# msrestazure.azure_operation.AzureOperationPoller
|
||||
"""Delete an availability set
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param str rg_name: resource group name
|
||||
:param str as_name: availability set name
|
||||
:rtype: msrestazure.azure_operation.AzureOperationPoller
|
||||
:return: async op poller
|
||||
"""
|
||||
logger.debug('deleting availability set {}'.format(as_name))
|
||||
return compute_client.availability_sets.delete(
|
||||
resource_group_name=rg_name,
|
||||
availability_set_name=as_name,
|
||||
)
|
||||
|
||||
|
||||
def delete_storage_cluster(
|
||||
resource_client, compute_client, network_client, blob_client, config,
|
||||
sc_id, delete_data_disks=False, delete_virtual_network=False,
|
||||
|
@ -1522,7 +1634,7 @@ def delete_storage_cluster(
|
|||
if util.is_none_or_empty(as_name) or as_name in deleted:
|
||||
continue
|
||||
deleted.add(as_name)
|
||||
_delete_availability_set(
|
||||
resource.delete_availability_set(
|
||||
compute_client, rfs.storage_cluster.resource_group, as_name)
|
||||
logger.info('availability set {} deleted'.format(as_name))
|
||||
deleted.clear()
|
||||
|
|
|
@ -639,13 +639,19 @@ def create_network_interface(
|
|||
logger.debug('assigning public ip {} to network interface {}'.format(
|
||||
pip.name, nic_name))
|
||||
# create network ip config
|
||||
if private_ips is None:
|
||||
if private_ips is None or private_ips[offset] is None:
|
||||
logger.debug(
|
||||
'assigning private ip dynamically to network interface {}'.format(
|
||||
nic_name))
|
||||
network_ip_config = networkmodels.NetworkInterfaceIPConfiguration(
|
||||
name=vm_resource.hostname_prefix,
|
||||
subnet=subnet,
|
||||
public_ip_address=pip,
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
'assigning private ip {} statically to network '
|
||||
'interface {}'.format(private_ips[offset], nic_name))
|
||||
network_ip_config = networkmodels.NetworkInterfaceIPConfiguration(
|
||||
name=vm_resource.hostname_prefix,
|
||||
subnet=subnet,
|
||||
|
@ -656,7 +662,8 @@ def create_network_interface(
|
|||
private_ip_address_version=networkmodels.IPVersion.ipv4,
|
||||
|
||||
)
|
||||
logger.debug('creating network interface: {}'.format(nic_name))
|
||||
logger.debug('creating network interface: {} with nsg={}'.format(
|
||||
nic_name, nsg.name if nsg else None))
|
||||
return network_client.network_interfaces.create_or_update(
|
||||
resource_group_name=vm_resource.resource_group,
|
||||
network_interface_name=nic_name,
|
||||
|
@ -671,10 +678,10 @@ def create_network_interface(
|
|||
|
||||
def create_virtual_machine(
|
||||
compute_client, vm_resource, availset, nics, disks, ssh_pub_key,
|
||||
offset, enable_msi=False):
|
||||
offset, enable_msi=False, tags=None):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# settings.VmResource, computemodels.AvailabilitySet,
|
||||
# dict, dict, computemodels.SshPublicKey, int, bool) ->
|
||||
# dict, dict, computemodels.SshPublicKey, int, bool, dict) ->
|
||||
# Tuple[int, msrestazure.azure_operation.AzureOperationPoller]
|
||||
"""Create a virtual machine
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
|
@ -686,6 +693,7 @@ def create_virtual_machine(
|
|||
:param computemodels.SshPublicKey ssh_pub_key: SSH public key
|
||||
:param int offset: vm number
|
||||
:param bool enable_msi: enable system MSI
|
||||
:param dict tags: tags for VM
|
||||
:rtype: tuple
|
||||
:return: (offset int, msrestazure.azure_operation.AzureOperationPoller)
|
||||
"""
|
||||
|
@ -784,43 +792,81 @@ def create_virtual_machine(
|
|||
),
|
||||
identity=identity,
|
||||
zones=zone,
|
||||
tags=tags,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def create_msi_virtual_machine_extension(
|
||||
compute_client, vm_resource, vm_name, offset, verbose=False):
|
||||
def create_availability_set(
|
||||
compute_client, vm_resource, vm_count, update_domains=None,
|
||||
fault_domains=None):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# settings.VmResource, str, int,
|
||||
# bool) -> msrestazure.azure_operation.AzureOperationPoller
|
||||
"""Create a virtual machine extension
|
||||
# settings.VmResource, int, Optional[int], Optional[int]) ->
|
||||
# msrestazure.azure_operation.AzureOperationPoller
|
||||
"""Create an availability set
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param settings.VmResource vm_resource: VM resource
|
||||
:param str vm_name: vm name
|
||||
:param int offset: vm number
|
||||
:param bool verbose: verbose logging
|
||||
:rtype: msrestazure.azure_operation.AzureOperationPoller
|
||||
:param settings.VmResource vm_resource: VM Resource
|
||||
:param int vm_count: VM count
|
||||
:param int update_domains: update domains
|
||||
:param int fault_domains: fault domains
|
||||
:rtype: msrestazure.azure_operation.AzureOperationPoller or None
|
||||
:return: msrestazure.azure_operation.AzureOperationPoller
|
||||
"""
|
||||
vm_ext_name = settings.generate_virtual_machine_msi_extension_name(
|
||||
vm_resource, offset)
|
||||
logger.debug('creating virtual machine extension: {}'.format(vm_ext_name))
|
||||
return compute_client.virtual_machine_extensions.create_or_update(
|
||||
if vm_count <= 1:
|
||||
logger.info('insufficient vm_count for availability set')
|
||||
return None
|
||||
if vm_resource.zone is not None:
|
||||
logger.info('cannot create an availability set for zonal resource')
|
||||
return None
|
||||
as_name = settings.generate_availability_set_name(vm_resource)
|
||||
# check and fail if as exists
|
||||
try:
|
||||
compute_client.availability_sets.get(
|
||||
resource_group_name=vm_resource.resource_group,
|
||||
availability_set_name=as_name,
|
||||
)
|
||||
raise RuntimeError('availability set {} exists'.format(as_name))
|
||||
except msrestazure.azure_exceptions.CloudError as e:
|
||||
if e.status_code == 404:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
logger.debug('creating availability set: {}'.format(as_name))
|
||||
if update_domains is None:
|
||||
update_domains = 20
|
||||
if fault_domains is None:
|
||||
fault_domains = 2
|
||||
return compute_client.availability_sets.create_or_update(
|
||||
resource_group_name=vm_resource.resource_group,
|
||||
vm_name=vm_name,
|
||||
vm_extension_name=vm_ext_name,
|
||||
extension_parameters=compute_client.virtual_machine_extensions.models.
|
||||
VirtualMachineExtension(
|
||||
availability_set_name=as_name,
|
||||
# user maximums ud, fd from settings due to region variability
|
||||
parameters=compute_client.virtual_machines.models.AvailabilitySet(
|
||||
location=vm_resource.location,
|
||||
publisher='Microsoft.ManagedIdentity',
|
||||
virtual_machine_extension_type='ManagedIdentityExtensionForLinux',
|
||||
type_handler_version='1.0',
|
||||
auto_upgrade_minor_version=True,
|
||||
settings={
|
||||
'port': 50342,
|
||||
},
|
||||
),
|
||||
platform_update_domain_count=update_domains,
|
||||
platform_fault_domain_count=fault_domains,
|
||||
sku=compute_client.virtual_machines.models.Sku(
|
||||
name='Aligned',
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def delete_availability_set(compute_client, rg_name, as_name):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient, str, str) ->
|
||||
# msrestazure.azure_operation.AzureOperationPoller
|
||||
"""Delete an availability set
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
:param str rg_name: resource group name
|
||||
:param str as_name: availability set name
|
||||
:rtype: msrestazure.azure_operation.AzureOperationPoller
|
||||
:return: async op poller
|
||||
"""
|
||||
logger.debug('deleting availability set {}'.format(as_name))
|
||||
return compute_client.availability_sets.delete(
|
||||
resource_group_name=rg_name,
|
||||
availability_set_name=as_name,
|
||||
)
|
||||
|
||||
|
||||
|
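The generalized resource.create_availability_set above replaces the remotefs-specific helper: it skips single-VM and zonal resources, and defaults the update and fault domain counts when they are not supplied. The sketch below isolates just that decision logic; the Azure SDK call itself is intentionally omitted.

# Standalone sketch of the decision logic in create_availability_set above.
def availability_set_parameters(
        vm_count, zone=None, update_domains=None, fault_domains=None):
    """Return (update_domains, fault_domains) or None if no set is needed."""
    if vm_count <= 1:
        return None  # insufficient vm_count for an availability set
    if zone is not None:
        return None  # cannot create an availability set for a zonal resource
    if update_domains is None:
        update_domains = 20
    if fault_domains is None:
        fault_domains = 2
    return update_domains, fault_domains

print(availability_set_parameters(1))                   # None
print(availability_set_parameters(3, zone=1))            # None
print(availability_set_parameters(3))                    # (20, 2)
print(availability_set_parameters(3, fault_domains=3))   # (20, 3)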
@ -955,11 +1001,11 @@ def deallocate_virtual_machine(compute_client, rg_name, vm_name):
|
|||
|
||||
def get_ssh_info(
|
||||
compute_client, network_client, vm_res, ssh_key_prefix=None, nic=None,
|
||||
pip=None):
|
||||
pip=None, offset=0):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient,
|
||||
# settings.VmResource, str, networkmodes.NetworkInterface,
|
||||
# networkmodels.PublicIPAddress) ->
|
||||
# networkmodels.PublicIPAddress, int) ->
|
||||
# Tuple[pathlib.Path, int, str, str]
|
||||
"""Get SSH info to a federation proxy
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
|
@ -970,10 +1016,11 @@ def get_ssh_info(
|
|||
:param str ssh_key_prefix: ssh key prefix
|
||||
:param networkmodels.NetworkInterface nic: network interface
|
||||
:param networkmodels.PublicIPAddress pip: public ip
|
||||
:param int offset: offset
|
||||
:rtype: tuple
|
||||
:return (ssh private key, port, username, ip)
|
||||
"""
|
||||
vm_name = settings.generate_virtual_machine_name(vm_res, 0)
|
||||
vm_name = settings.generate_virtual_machine_name(vm_res, offset)
|
||||
try:
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=vm_res.resource_group,
|
||||
|
@ -1009,10 +1056,11 @@ def get_ssh_info(
|
|||
|
||||
|
||||
def ssh_to_virtual_machine_resource(
|
||||
compute_client, network_client, vm_res, ssh_key_prefix, tty, command):
|
||||
compute_client, network_client, vm_res, ssh_key_prefix, tty, command,
|
||||
offset=0):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient,
|
||||
# settings.VmResource, str, bool, tuple) -> None
|
||||
# settings.VmResource, str, bool, tuple, int) -> None
|
||||
"""SSH to a node
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
|
@ -1022,9 +1070,11 @@ def ssh_to_virtual_machine_resource(
|
|||
:param str ssh_key_prefix: ssh key prefix
|
||||
:param bool tty: allocate pseudo-tty
|
||||
:param tuple command: command to execute
|
||||
:param int offset: offset
|
||||
"""
|
||||
ssh_priv_key, port, username, ip = get_ssh_info(
|
||||
compute_client, network_client, vm_res, ssh_key_prefix=ssh_key_prefix)
|
||||
compute_client, network_client, vm_res, ssh_key_prefix=ssh_key_prefix,
|
||||
offset=offset)
|
||||
crypto.connect_or_exec_ssh_command(
|
||||
ip, port, ssh_priv_key, username, tty=tty, command=command)
|
||||
|
||||
|
@ -1123,10 +1173,10 @@ def start_virtual_machine_resource(
|
|||
|
||||
|
||||
def stat_virtual_machine_resource(
|
||||
compute_client, network_client, config, vm_res):
|
||||
compute_client, network_client, config, vm_res, offset=0):
|
||||
# type: (azure.mgmt.compute.ComputeManagementClient,
|
||||
# azure.mgmt.network.NetworkManagementClient, dict,
|
||||
# settings.VmResource) -> None
|
||||
# settings.VmResource, int) -> None
|
||||
"""Retrieve status of a virtual machine resource
|
||||
:param azure.mgmt.compute.ComputeManagementClient compute_client:
|
||||
compute client
|
||||
|
@ -1134,9 +1184,10 @@ def stat_virtual_machine_resource(
|
|||
network client
|
||||
:param dict config: configuration dict
|
||||
:param settings.VmResource vm_res: resource
|
||||
:param int offset: offset
|
||||
"""
|
||||
# retrieve all vms
|
||||
vm_name = settings.generate_virtual_machine_name(vm_res, 0)
|
||||
vm_name = settings.generate_virtual_machine_name(vm_res, offset)
|
||||
try:
|
||||
vm = compute_client.virtual_machines.get(
|
||||
resource_group_name=vm_res.resource_group,
|
||||
|
|
|
@ -72,8 +72,8 @@ _GPU_VISUALIZATION_INSTANCES = re.compile(
|
|||
re.IGNORECASE
|
||||
)
|
||||
_RDMA_INSTANCES = re.compile(
|
||||
# standard a8/a9, h+r, nc+r, nd+r
|
||||
r'^standard_((a8|a9)|((h|nc|nd)+[\d]+m?rs?(_v[\d])?))$',
|
||||
# standard a8/a9, h+r, nc+r, nd+r, hb/hc
|
||||
r'^standard_((a8|a9)|((h|hb|hc|nc|nd)+[\d]+m?rs?(_v[\d])?))$',
|
||||
re.IGNORECASE
|
||||
)
|
||||
_PREMIUM_STORAGE_INSTANCES = re.compile(
|
||||
|
@ -110,6 +110,26 @@ _VM_TCP_NO_TUNE = frozenset((
|
|||
'standard_b1s', 'standard_b1ms', 'standard_b2s', 'standard_b2ms',
|
||||
'standard_b4ms', 'standard_b8ms',
|
||||
))
|
||||
_VM_GPU_COUNT = {
|
||||
1: re.compile(r'^standard_n[cdv]6r?s?(_v[\d])?$', re.IGNORECASE),
|
||||
2: re.compile(r'^standard_n[cdv]12r?s?(_v[\d])?$', re.IGNORECASE),
|
||||
4: re.compile(r'^standard_n[cdv]24r?s?(_v[\d])?$', re.IGNORECASE),
|
||||
8: re.compile(r'^standard_nd40s_v2$', re.IGNORECASE),
|
||||
}
|
||||
_VM_GPU_CLASS = {
|
||||
'tesla_k80': re.compile(r'^standard_n[c][\d]+r?$', re.IGNORECASE),
|
||||
'tesla_p40': re.compile(r'^standard_n[d][\d]+r?s?$', re.IGNORECASE),
|
||||
'tesla_p100': re.compile(r'^standard_n[c][\d]+r?s_v2$', re.IGNORECASE),
|
||||
'tesla_v100': re.compile(
|
||||
r'^standard_n(([c][\d]+r?s_v3)|(d40s_v2))$', re.IGNORECASE),
|
||||
'tesla_m60': re.compile(r'^standard_nv[\d]+s?(_v2)?$', re.IGNORECASE),
|
||||
}
|
||||
_VM_IB_CLASS = {
|
||||
'qdr_ib': re.compile(r'^standard_(a8|a9)$', re.IGNORECASE),
|
||||
'fdr_ib': re.compile(
|
||||
r'^standard_(((h|nc|nd)+[\d]+m?rs?(_v[\d])?))$', re.IGNORECASE),
|
||||
'edr_ib': re.compile(r'^standard_(hc|hb)+[\d]+rs$', re.IGNORECASE),
|
||||
}
|
||||
_SINGULARITY_COMMANDS = frozenset(('exec', 'run'))
|
||||
_FORBIDDEN_MERGE_TASK_PROPERTIES = frozenset((
|
||||
'depends_on', 'depends_on_range', 'multi_instance', 'task_factory'
|
||||
|
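The new _VM_GPU_COUNT, _VM_GPU_CLASS, and _VM_IB_CLASS tables above map VM sizes to hardware by regex, which is how the auto feature tagging for Slurm nodes is driven. The sketch below exercises two of the patterns exactly as they appear in the hunk.

# Standalone sketch of the regex-table lookup above, using patterns
# copied from the hunk (GPU count and GPU class).
import re

_VM_GPU_COUNT = {
    1: re.compile(r'^standard_n[cdv]6r?s?(_v[\d])?$', re.IGNORECASE),
    2: re.compile(r'^standard_n[cdv]12r?s?(_v[\d])?$', re.IGNORECASE),
    4: re.compile(r'^standard_n[cdv]24r?s?(_v[\d])?$', re.IGNORECASE),
    8: re.compile(r'^standard_nd40s_v2$', re.IGNORECASE),
}
_VM_GPU_CLASS = {
    'tesla_k80': re.compile(r'^standard_n[c][\d]+r?$', re.IGNORECASE),
    'tesla_v100': re.compile(
        r'^standard_n(([c][\d]+r?s_v3)|(d40s_v2))$', re.IGNORECASE),
}

def lookup(table, vm_size):
    """Return the first table key whose pattern matches the VM size."""
    for key, pattern in table.items():
        if pattern.match(vm_size):
            return key
    return None

print(lookup(_VM_GPU_COUNT, 'STANDARD_NC24rs_v3'))  # 4
print(lookup(_VM_GPU_CLASS, 'STANDARD_NC24rs_v3'))  # tesla_v100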
@ -447,6 +467,38 @@ FederationProxyOptionsSettings = collections.namedtuple(
|
|||
'scheduling_after_success_evaluate_autoscale',
|
||||
]
|
||||
)
|
||||
SlurmBatchPoolSettings = collections.namedtuple(
|
||||
'SlurmBatchPoolSettings', [
|
||||
'batch_service_url', 'compute_node_type', 'max_compute_nodes',
|
||||
'weight', 'features', 'reclaim_exclude_num_nodes',
|
||||
]
|
||||
)
|
||||
SlurmPartitionSettings = collections.namedtuple(
|
||||
'SlurmPartitionSettings', [
|
||||
'batch_pools', 'max_runtime_limit', 'default',
|
||||
]
|
||||
)
|
||||
SlurmUnmanagedPartitionSettings = collections.namedtuple(
|
||||
'SlurmUnmanagedPartitionSettings', [
|
||||
'partition', 'nodes',
|
||||
]
|
||||
)
|
||||
SlurmOptionsSettings = collections.namedtuple(
|
||||
'SlurmOptionsSettings', [
|
||||
'cluster_id', 'idle_reclaim_time', 'max_nodes', 'elastic_partitions',
|
||||
'unmanaged_partitions',
|
||||
]
|
||||
)
|
||||
SlurmSharedDataVolumesSettings = collections.namedtuple(
|
||||
'SlurmSharedDataVolumesSettings', [
|
||||
'id', 'host_mount_path', 'store_slurmctld_state',
|
||||
]
|
||||
)
|
||||
SlurmCredentialsSettings = collections.namedtuple(
|
||||
'SlurmCredentialsSettings', [
|
||||
'db_password',
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class VmResource(object):
|
||||
|
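The namedtuples above define the Slurm settings surface consumed by the rest of the commit. As a quick illustration only, the sketch below hand-constructs a SlurmBatchPoolSettings record with values from the sample configuration at the top of this diff; in Shipyard these records are produced by slurm_options_settings() and related readers, not built directly.

# Illustration only: hand-construct one of the new settings records with
# values from the sample configuration at the top of this diff.
import collections

SlurmBatchPoolSettings = collections.namedtuple(
    'SlurmBatchPoolSettings', [
        'batch_service_url', 'compute_node_type', 'max_compute_nodes',
        'weight', 'features', 'reclaim_exclude_num_nodes',
    ]
)

mypool1 = SlurmBatchPoolSettings(
    batch_service_url='https://...',
    compute_node_type='dedicated',
    max_compute_nodes=32,
    weight=0,
    features=['arbitrary_constraint_1'],
    reclaim_exclude_num_nodes=8,
)
print(mypool1.max_compute_nodes, mypool1.features)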
@ -664,6 +716,34 @@ def get_gpu_type_from_vm_size(vm_size):
|
|||
return None
|
||||
|
||||
|
||||
def get_num_gpus_from_vm_size(vm_size):
|
||||
# type: (str) -> int
|
||||
"""Get number of GPUs from VM size
|
||||
:param str vm_size: vm size
|
||||
:rtype: int
|
||||
:return: number of GPUs
|
||||
"""
|
||||
for vm in _VM_GPU_COUNT:
|
||||
if _VM_GPU_COUNT[vm].match(vm_size):
|
||||
return vm
|
||||
raise RuntimeError('vm_size {} has no mapping to number of GPUs'.format(
|
||||
vm_size))
|
||||
|
||||
|
||||
def get_gpu_class_from_vm_size(vm_size):
|
||||
# type: (str) -> str
|
||||
"""Get GPU class from VM size
|
||||
:param str vm_size: vm size
|
||||
:rtype: str
|
||||
:return: GPU class
|
||||
"""
|
||||
for c in _VM_GPU_CLASS:
|
||||
if _VM_GPU_CLASS[c].match(vm_size):
|
||||
return c
|
||||
raise RuntimeError('vm_size {} has no mapping to GPU class'.format(
|
||||
vm_size))
|
||||
|
||||
|
||||
def gpu_configuration_check(config, vm_size=None):
|
||||
# type: (dict, str) -> bool
|
||||
"""Check if OS is allowed with a GPU VM
|
||||
|
@ -760,6 +840,20 @@ def is_rdma_pool(vm_size):
|
|||
return _RDMA_INSTANCES.match(vm_size) is not None
|
||||
|
||||
|
||||
def get_ib_class_from_vm_size(vm_size):
|
||||
# type: (str) -> str
|
||||
"""Get IB class from VM size
|
||||
:param str vm_size: vm size
|
||||
:rtype: str
|
||||
:return: IB class
|
||||
"""
|
||||
for c in _VM_IB_CLASS:
|
||||
if _VM_IB_CLASS[c].match(vm_size):
|
||||
return c
|
||||
raise RuntimeError('vm_size {} has no mapping to IB class'.format(
|
||||
vm_size))
|
||||
|
||||
|
||||
def is_premium_storage_vm_size(vm_size):
|
||||
# type: (str) -> bool
|
||||
"""Check if vm size is premium storage compatible
|
||||
|
@ -1932,6 +2026,23 @@ def set_credentials_registry_password(config, link, is_docker, password):
|
|||
config['credentials'][kind][link]['password'] = password
|
||||
|
||||
|
||||
def credentials_slurm(config):
|
||||
# type: (dict) -> SlurmCredentialsSettings
|
||||
"""Get slurm settings
|
||||
:param dict config: configuration object
|
||||
:rtype: SlurmCredentialsSettings
|
||||
:return: Slurm settings
|
||||
"""
|
||||
try:
|
||||
creds = config['credentials']
|
||||
except (KeyError, TypeError):
|
||||
creds = {}
|
||||
conf = _kv_read_checked(creds, 'slurm', default={})
|
||||
return SlurmCredentialsSettings(
|
||||
db_password=_kv_read_checked(conf, 'db_password'),
|
||||
)
|
||||
|
||||
|
||||
# GLOBAL SETTINGS
|
||||
def batch_shipyard_settings(config):
|
||||
# type: (dict) -> BatchShipyardSettings
|
||||
|
@ -4896,23 +5007,274 @@ def federation_settings(config):
|
|||
)
|
||||
|
||||
|
||||
def federation_storage_account_settings(config):
|
||||
# type: (dict) ->str
|
||||
"""Get federation storage account settings selector
|
||||
def slurm_options_settings(config):
|
||||
# type: (dict) -> SlurmOptionsSettings
|
||||
"""Get slurm options settings
|
||||
:param dict config: configuration dict
|
||||
:rtype: str
|
||||
:return: federation storage settings link
|
||||
:rtype: SlurmOptionsSettings
|
||||
:return: slurm options settings
|
||||
"""
|
||||
try:
|
||||
conf = config['federation']
|
||||
conf = config['slurm']['slurm_options']
|
||||
except KeyError:
|
||||
conf = {}
|
||||
cluster_id = config['slurm']['cluster_id']
|
||||
if util.is_none_or_empty(cluster_id) or len(cluster_id) > 22:
|
||||
raise ValueError(
|
||||
'cluster_id is invalid. Must be between 1 and 22 '
|
||||
'characters in length')
|
||||
bc = credentials_batch(config)
|
||||
idle_reclaim_time = _kv_read(conf, 'idle_reclaim_time', default='00:15:00')
|
||||
idle_reclaim_time = util.convert_string_to_timedelta(idle_reclaim_time)
|
||||
if idle_reclaim_time.total_seconds() <= 0:
|
||||
raise ValueError('idle_reclaim_time must be positive')
|
||||
max_nodes = 0
|
||||
partitions = {}
|
||||
part_conf = _kv_read_checked(conf, 'elastic_partitions')
|
||||
for key in part_conf:
|
||||
part = _kv_read_checked(part_conf, key)
|
||||
batch_pools = {}
|
||||
pool_conf = _kv_read_checked(part, 'batch_pools', default={})
|
||||
for pkey in pool_conf:
|
||||
bpool = _kv_read_checked(pool_conf, pkey)
|
||||
batch_service_url = _kv_read_checked(bpool, 'account_service_url')
|
||||
if util.is_none_or_empty(batch_service_url):
|
||||
batch_service_url = bc.account_service_url
|
||||
max_compute_nodes = _kv_read(bpool, 'max_compute_nodes')
|
||||
reclaim_exclude_num_nodes = _kv_read(
|
||||
bpool, 'reclaim_exclude_num_nodes', default=0)
|
||||
if reclaim_exclude_num_nodes > max_compute_nodes:
|
||||
raise ValueError(
|
||||
'reclaim_exclude_num_nodes {} > '
|
||||
'max_compute_nodes {}'.format(
|
||||
reclaim_exclude_num_nodes, max_compute_nodes))
|
||||
batch_pools[pkey] = SlurmBatchPoolSettings(
|
||||
batch_service_url=batch_service_url,
|
||||
compute_node_type=_kv_read_checked(bpool, 'compute_node_type'),
|
||||
max_compute_nodes=max_compute_nodes,
|
||||
weight=_kv_read(bpool, 'weight'),
|
||||
features=_kv_read_checked(bpool, 'features', default=[]),
|
||||
reclaim_exclude_num_nodes=reclaim_exclude_num_nodes,
|
||||
)
|
||||
max_nodes = max(max_nodes, batch_pools[pkey].max_compute_nodes)
|
||||
max_runtime_limit = _kv_read_checked(part, 'max_runtime_limit')
|
||||
if util.is_not_empty(max_runtime_limit):
|
||||
max_runtime_limit = max_runtime_limit.replace('.', '-')
|
||||
else:
|
||||
max_runtime_limit = 'UNLIMITED'
|
||||
partition = SlurmPartitionSettings(
|
||||
batch_pools=batch_pools,
|
||||
max_runtime_limit=max_runtime_limit,
|
||||
default=_kv_read(part, 'default'),
|
||||
)
|
||||
partitions[key] = partition
|
||||
unmanaged_partitions = []
|
||||
upart_conf = _kv_read_checked(conf, 'unmanaged_partitions', default=[])
|
||||
for upart in upart_conf:
|
||||
unmanaged_partitions.append(SlurmUnmanagedPartitionSettings(
|
||||
partition=_kv_read_checked(upart, 'partition'),
|
||||
nodes=_kv_read_checked(upart, 'nodes'),
|
||||
))
|
||||
return SlurmOptionsSettings(
|
||||
cluster_id=cluster_id,
|
||||
idle_reclaim_time=idle_reclaim_time,
|
||||
max_nodes=max_nodes,
|
||||
elastic_partitions=partitions,
|
||||
unmanaged_partitions=unmanaged_partitions,
|
||||
)
|
||||
|
||||
|
||||
def slurm_settings(config, kind):
|
||||
# type: (dict, str) -> VmResource
"""Get slurm controller or login node settings
:param dict config: configuration dict
:param str kind: kind of slurm resource, either "controller" or "login"
:rtype: VmResource
|
||||
:return: VM resource settings
|
||||
"""
|
||||
# general settings
|
||||
try:
|
||||
conf = config['slurm']
|
||||
if util.is_none_or_empty(conf):
|
||||
raise KeyError
|
||||
except KeyError:
|
||||
raise ValueError('federation settings are invalid or missing')
|
||||
raise ValueError('slurm settings are invalid or missing')
|
||||
location = conf['location']
|
||||
if util.is_none_or_empty(location):
|
||||
raise ValueError('invalid location in slurm')
|
||||
rg = _kv_read_checked(conf, 'resource_group')
|
||||
if util.is_none_or_empty(rg):
|
||||
raise ValueError('invalid resource_group in slurm')
|
||||
zone = _kv_read(conf, 'zone')
|
||||
hostname_prefix = '{}-{}'.format(
|
||||
_kv_read_checked(conf, 'cluster_id'),
|
||||
# Azure doesn't like "login" for DNS
|
||||
'gateway' if kind == 'login' else kind
|
||||
)
|
||||
# get controller or login settings
|
||||
try:
|
||||
conf = conf[kind]
|
||||
if util.is_none_or_empty(conf):
|
||||
raise KeyError
|
||||
except KeyError:
|
||||
raise ValueError(
|
||||
'slurm:{} settings are invalid or missing'.format(kind))
|
||||
# vm settings
|
||||
vm_size = _kv_read_checked(conf, 'vm_size')
|
||||
accel_net = _kv_read(conf, 'accelerated_networking', False)
|
||||
# public ip settings
|
||||
pip_conf = _kv_read_checked(conf, 'public_ip', {})
|
||||
pip_enabled = _kv_read(pip_conf, 'enabled', True)
|
||||
pip_static = _kv_read(pip_conf, 'static', False)
|
||||
# network security settings
|
||||
ns_conf = conf['network_security']
|
||||
ns_inbound = {
|
||||
'ssh': InboundNetworkSecurityRule(
|
||||
destination_port_range='22',
|
||||
source_address_prefix=_kv_read_checked(ns_conf, 'ssh', ['*']),
|
||||
protocol='tcp',
|
||||
),
|
||||
}
|
||||
if not isinstance(ns_inbound['ssh'].source_address_prefix, list):
|
||||
raise ValueError('expected list for ssh network security rule')
|
||||
if 'custom_inbound_rules' in ns_conf:
|
||||
for key in ns_conf['custom_inbound_rules']:
|
||||
ns_inbound[key] = InboundNetworkSecurityRule(
|
||||
destination_port_range=_kv_read_checked(
|
||||
ns_conf['custom_inbound_rules'][key],
|
||||
'destination_port_range'),
|
||||
source_address_prefix=_kv_read_checked(
|
||||
ns_conf['custom_inbound_rules'][key],
|
||||
'source_address_prefix'),
|
||||
protocol=_kv_read_checked(
|
||||
ns_conf['custom_inbound_rules'][key], 'protocol'),
|
||||
)
|
||||
if not isinstance(ns_inbound[key].source_address_prefix, list):
|
||||
raise ValueError(
|
||||
'expected list for network security rule {} '
|
||||
'source_address_prefix'.format(key))
|
||||
# ssh settings
|
||||
ssh_conf = conf['ssh']
|
||||
ssh_username = _kv_read_checked(ssh_conf, 'username')
|
||||
ssh_public_key = _kv_read_checked(ssh_conf, 'ssh_public_key')
|
||||
if util.is_not_empty(ssh_public_key):
|
||||
ssh_public_key = pathlib.Path(ssh_public_key)
|
||||
ssh_public_key_data = _kv_read_checked(ssh_conf, 'ssh_public_key_data')
|
||||
ssh_private_key = _kv_read_checked(ssh_conf, 'ssh_private_key')
|
||||
if util.is_not_empty(ssh_private_key):
|
||||
ssh_private_key = pathlib.Path(ssh_private_key)
|
||||
if (ssh_public_key is not None and
|
||||
util.is_not_empty(ssh_public_key_data)):
|
||||
raise ValueError('cannot specify both an SSH public key file and data')
|
||||
if (ssh_public_key is None and
|
||||
util.is_none_or_empty(ssh_public_key_data) and
|
||||
ssh_private_key is not None):
|
||||
raise ValueError(
|
||||
'cannot specify an SSH private key with no public key specified')
|
||||
ssh_gen_file_path = _kv_read_checked(
|
||||
ssh_conf, 'generated_file_export_path', '.')
|
||||
return VmResource(
|
||||
location=location,
|
||||
resource_group=rg,
|
||||
zone=zone,
|
||||
hostname_prefix=hostname_prefix,
|
||||
virtual_network=virtual_network_settings(
|
||||
conf,
|
||||
default_resource_group=rg,
|
||||
default_existing_ok=False,
|
||||
default_create_nonexistant=True,
|
||||
),
|
||||
network_security=NetworkSecuritySettings(
|
||||
inbound=ns_inbound,
|
||||
),
|
||||
vm_size=vm_size,
|
||||
accelerated_networking=accel_net,
|
||||
public_ip=PublicIpSettings(
|
||||
enabled=pip_enabled,
|
||||
static=pip_static,
|
||||
),
|
||||
ssh=SSHSettings(
|
||||
username=ssh_username,
|
||||
expiry_days=9999,
|
||||
ssh_public_key=ssh_public_key,
|
||||
ssh_public_key_data=ssh_public_key_data,
|
||||
ssh_private_key=ssh_private_key,
|
||||
generate_docker_tunnel_script=False,
|
||||
generated_file_export_path=ssh_gen_file_path,
|
||||
hpn_server_swap=False,
|
||||
allow_docker_access=False,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def slurm_vm_count(config, kind):
|
||||
# type: (dict, str) -> int
|
||||
"""Get Slurm controller vm count
|
||||
:param dict config: configuration dict
|
||||
:param str kind: kind
|
||||
:rtype: int
|
||||
:return: vm count
|
||||
"""
|
||||
conf = _kv_read_checked(_kv_read_checked(config, 'slurm'), kind)
|
||||
return _kv_read(conf, 'vm_count')
|
||||
|
||||
|
||||
def slurm_additional_prep_script(config, kind):
|
||||
# type: (dict, str) -> str
|
||||
"""Get Slurm additional prep script
|
||||
:param dict config: configuration dict
|
||||
:param str kind: kind
|
||||
:rtype: str
|
||||
:return: prep script location
|
||||
"""
|
||||
conf = _kv_read_checked(_kv_read_checked(config, 'slurm'), kind)
|
||||
return _kv_read(conf, 'additional_prep_script')
|
||||
|
||||
|
||||
def slurm_shared_data_volumes(config):
|
||||
# type: (dict) -> List[str]
|
||||
"""Get Slurm shared data volumes
|
||||
:param dict config: configuration dict
|
||||
:rtype: List[str]
|
||||
:return: list of SlurmSharedDataVolumesSettings
|
||||
"""
|
||||
conf = _kv_read_checked(config, 'slurm')
|
||||
sdv = _kv_read_checked(conf, 'shared_data_volumes', default={})
|
||||
vols = []
|
||||
state = False
|
||||
for sdkey in sdv:
|
||||
store_slurmctld_state = _kv_read(sdv[sdkey], 'store_slurmctld_state')
|
||||
if store_slurmctld_state:
|
||||
if state:
|
||||
raise ValueError(
|
||||
'only one shared data volume should be designated as '
|
||||
'store_slurmctld_state')
|
||||
state = True
|
||||
vols.append(SlurmSharedDataVolumesSettings(
|
||||
id=sdkey,
|
||||
host_mount_path=_kv_read_checked(sdv[sdkey], 'host_mount_path'),
|
||||
store_slurmctld_state=store_slurmctld_state,
|
||||
))
|
||||
return vols
|
||||
|
||||
|
||||
def other_storage_account_settings(config, key):
|
||||
# type: (dict, str) -> str
|
||||
"""Get other storage account settings selector
|
||||
:param dict config: configuration dict
|
||||
:param str key: config key
|
||||
:rtype: str
|
||||
:return: other storage settings link
|
||||
"""
|
||||
try:
|
||||
conf = config[key]
|
||||
if util.is_none_or_empty(conf):
|
||||
raise KeyError
|
||||
except KeyError:
|
||||
raise ValueError('{} settings are invalid or missing'.format(key))
|
||||
ssel = _kv_read_checked(conf, 'storage_account_settings')
|
||||
if util.is_none_or_empty(ssel):
|
||||
raise ValueError(
|
||||
'federation storage_account_settings are invalid or missing')
|
||||
'{} storage_account_settings are invalid or missing'.format(key))
|
||||
return ssel
|
||||
|
||||
|
||||
|
@ -4924,7 +5286,18 @@ def federation_credentials_storage(config):
|
|||
:return: federation storage cred settings
|
||||
"""
|
||||
return credentials_storage(
|
||||
config, federation_storage_account_settings(config))
|
||||
config, other_storage_account_settings(config, 'federation'))
|
||||
|
||||
|
||||
def slurm_credentials_storage(config):
|
||||
# type: (dict) -> StorageCredentialsSettings
|
||||
"""Get slurm storage account settings
|
||||
:param dict config: configuration dict
|
||||
:rtype: StorageCredentialsSettings
|
||||
:return: slurm storage cred settings
|
||||
"""
|
||||
return credentials_storage(
|
||||
config, other_storage_account_settings(config, 'slurm'))
|
||||
|
||||
|
||||
def generate_availability_set_name(vr):
|
||||
|
|
(File diff not shown because of its large size)
|
@ -31,7 +31,6 @@ from builtins import ( # noqa
|
|||
next, oct, open, pow, round, super, filter, map, zip)
|
||||
# stdlib imports
|
||||
import datetime
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
@ -81,6 +80,7 @@ _STORAGE_CONTAINERS = {
|
|||
'table_monitoring': None,
|
||||
'table_federation_global': None,
|
||||
'table_federation_jobs': None,
|
||||
'table_slurm': None,
|
||||
'queue_federation': None,
|
||||
# TODO remove following in future release
|
||||
'table_registry': None,
|
||||
|
@ -120,6 +120,7 @@ def set_storage_configuration(sep, postfix, sa, sakey, saep, sasexpiry):
|
|||
_STORAGE_CONTAINERS['table_monitoring'] = sep + 'monitor'
|
||||
_STORAGE_CONTAINERS['table_federation_jobs'] = sep + 'fedjobs'
|
||||
_STORAGE_CONTAINERS['table_federation_global'] = sep + 'fedglobal'
|
||||
_STORAGE_CONTAINERS['table_slurm'] = sep + 'slurm'
|
||||
_STORAGE_CONTAINERS['queue_federation'] = sep + 'fed'
|
||||
# TODO remove following containers in future release
|
||||
_STORAGE_CONTAINERS['table_registry'] = sep + 'registry'
|
||||
|
@ -427,8 +428,7 @@ def _add_global_resource(
|
|||
'global resource type: {}'.format(grtype))
|
||||
for gr in resources:
|
||||
resource = '{}:{}'.format(prefix, gr)
|
||||
resource_sha1 = hashlib.sha1(
|
||||
resource.encode('utf8')).hexdigest()
|
||||
resource_sha1 = util.hash_string(resource)
|
||||
logger.info('adding global resource: {} hash={}'.format(
|
||||
resource, resource_sha1))
|
||||
table_client.insert_or_replace_entity(
|
||||
|
@ -619,15 +619,6 @@ def remove_resources_from_monitoring(
|
|||
sc_id))
|
||||
|
||||
|
||||
def hash_string(strdata):
|
||||
"""Hash a string
|
||||
:param str strdata: string data to hash
|
||||
:rtype: str
|
||||
:return: hexdigest
|
||||
"""
|
||||
return hashlib.sha1(strdata.encode('utf8')).hexdigest()
|
||||
|
||||
|
||||
def hash_pool_and_service_url(pool_id, batch_service_url):
|
||||
"""Hash a pool and service url
|
||||
:param str pool_id: pool id
|
||||
|
@ -635,7 +626,8 @@ def hash_pool_and_service_url(pool_id, batch_service_url):
|
|||
:rtype: str
|
||||
:return: hashed pool and service url
|
||||
"""
|
||||
return hash_string('{}${}'.format(batch_service_url.rstrip('/'), pool_id))
|
||||
return util.hash_string('{}${}'.format(
|
||||
batch_service_url.rstrip('/'), pool_id))
|
||||
|
||||
|
||||
def hash_federation_id(federation_id):
|
||||
|
@ -644,7 +636,7 @@ def hash_federation_id(federation_id):
|
|||
:rtype: str
|
||||
:return: hashed federation id
|
||||
"""
|
||||
fedhash = hash_string(federation_id)
|
||||
fedhash = util.hash_string(federation_id)
|
||||
logger.debug('federation id {} -> {}'.format(federation_id, fedhash))
|
||||
return fedhash
|
||||
|
||||
|
@ -656,7 +648,8 @@ def generate_job_id_locator_partition_key(federation_id, job_id):
|
|||
:rtype: str
|
||||
:return: hashed fedhash and job id
|
||||
"""
|
||||
return '{}${}'.format(hash_string(federation_id), hash_string(job_id))
|
||||
return '{}${}'.format(
|
||||
util.hash_string(federation_id), util.hash_string(job_id))
|
||||
|
||||
|
||||
def create_federation_id(
|
||||
|
@ -1185,7 +1178,7 @@ def _pack_sequences(ent, unique_id):
|
|||
|
||||
def _retrieve_and_merge_sequence(
|
||||
table_client, pk, unique_id, kind, target, entity_must_not_exist):
|
||||
rk = hash_string(target)
|
||||
rk = util.hash_string(target)
|
||||
try:
|
||||
ent = table_client.get_entity(
|
||||
_STORAGE_CONTAINERS['table_federation_jobs'], pk, rk)
|
||||
|
@ -1335,7 +1328,7 @@ def list_blocked_actions_in_federation(
|
|||
except azure.common.AzureMissingResourceHttpError:
|
||||
pass
|
||||
else:
|
||||
rk = hash_string(
|
||||
rk = util.hash_string(
|
||||
job_id if util.is_not_empty(job_id) else job_schedule_id)
|
||||
try:
|
||||
entities = [table_client.get_entity(
|
||||
|
@ -1399,7 +1392,7 @@ def list_queued_actions_in_federation(
|
|||
except azure.common.AzureMissingResourceHttpError:
|
||||
pass
|
||||
else:
|
||||
rk = hash_string(
|
||||
rk = util.hash_string(
|
||||
job_id if util.is_not_empty(job_id) else job_schedule_id)
|
||||
try:
|
||||
entities = [table_client.get_entity(
|
||||
|
@ -1727,6 +1720,75 @@ def zap_unique_id_from_federation(
|
|||
print(json.dumps(rawout, sort_keys=True, indent=4))
|
||||
|
||||
|
||||
def create_slurm_partition(
|
||||
table_client, queue_client, config, cluster_id, partition_name,
|
||||
batch_service_url, pool_id, compute_node_type, max_compute_nodes,
|
||||
hostlist):
|
||||
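# type: (azure.cosmosdb.table.TableService, azure.storage.queue.QueueService,
#        dict, str, str, str, str, str, int, str) -> None
"""Create a Slurm partition entity and its corresponding storage queue
:param table_client: table client
:param queue_client: queue client
:param dict config: configuration dict
:param str cluster_id: slurm cluster id
:param str partition_name: slurm partition name
:param str batch_service_url: batch service url of the backing pool
:param str pool_id: batch pool id
:param str compute_node_type: compute node type (dedicated or low_priority)
:param int max_compute_nodes: maximum number of compute nodes
:param str hostlist: slurm hostlist for the partition
"""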
partpool_hash = util.hash_string('{}-{}-{}'.format(
    partition_name, batch_service_url, pool_id))
|
||||
# insert partition entity
|
||||
entity = {
|
||||
'PartitionKey': 'PARTITIONS${}'.format(cluster_id),
|
||||
'RowKey': '{}${}'.format(partition_name, partpool_hash),
|
||||
'BatchServiceUrl': batch_service_url,
|
||||
'BatchPoolId': pool_id,
|
||||
'ComputeNodeType': compute_node_type,
|
||||
'HostList': hostlist,
|
||||
'BatchShipyardSlurmVersion': 1,
|
||||
}
|
||||
logger.debug(
|
||||
'inserting slurm partition {}:{} entity to table for '
|
||||
'cluster {}'.format(partition_name, pool_id, cluster_id))
|
||||
try:
|
||||
table_client.insert_entity(_STORAGE_CONTAINERS['table_slurm'], entity)
|
||||
except azure.common.AzureConflictHttpError:
|
||||
logger.error('partition {}:{} cluster id {} already exists'.format(
|
||||
partition_name, pool_id, cluster_id))
|
||||
if util.confirm_action(
|
||||
config, 'overwrite existing partition {}:{} for '
|
||||
'cluster {}; this can result in undefined behavior'.format(
|
||||
partition_name, pool_id, cluster_id)):
|
||||
table_client.insert_or_replace_entity(
|
||||
_STORAGE_CONTAINERS['table_slurm'], entity)
|
||||
else:
|
||||
raise
|
||||
# create queue
|
||||
qname = '{}-{}'.format(cluster_id, partpool_hash)
|
||||
logger.debug('creating queue: {}'.format(qname))
|
||||
queue_client.create_queue(qname)
|
||||
|
||||
|
||||
def get_slurm_host_node_id(table_client, cluster_id, host):
|
||||
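# type: (azure.cosmosdb.table.TableService, str, str) -> str
"""Look up the Batch compute node id mapped to a Slurm host
:param table_client: table client
:param str cluster_id: slurm cluster id
:param str host: slurm host name
:rtype: str or None
:return: batch node id, or None if no mapping exists
"""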
node_id = None
|
||||
try:
|
||||
entity = table_client.get_entity(
|
||||
_STORAGE_CONTAINERS['table_slurm'],
|
||||
'{}${}'.format('HOSTS', cluster_id), host)
|
||||
node_id = entity['BatchNodeId']
|
||||
except (azure.common.AzureMissingResourceHttpError, KeyError):
|
||||
pass
|
||||
return node_id
|
||||
|
||||
|
||||
def clear_slurm_table_entities(table_client, cluster_id):
|
||||
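# type: (azure.cosmosdb.table.TableService, str) -> None
"""Delete all HOSTS and PARTITIONS entities for a Slurm cluster
:param table_client: table client
:param str cluster_id: slurm cluster id
"""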
logger.debug('deleting slurm cluster {} entities in table'.format(
|
||||
cluster_id))
|
||||
tablename = _STORAGE_CONTAINERS['table_slurm']
|
||||
keys = ['HOSTS', 'PARTITIONS']
|
||||
for key in keys:
|
||||
try:
|
||||
pk = '{}${}'.format(key, cluster_id)
|
||||
entities = table_client.query_entities(
|
||||
tablename,
|
||||
filter='PartitionKey eq \'{}\''.format(pk))
|
||||
except azure.common.AzureMissingResourceHttpError:
|
||||
pass
|
||||
else:
|
||||
batch_delete_entities(
|
||||
table_client, tablename, pk, [x['RowKey'] for x in entities]
|
||||
)
|
||||
|
||||
|
||||
def _check_file_and_upload(blob_client, file, key, container=None):
|
||||
# type: (azure.storage.blob.BlockBlobService, tuple, str, str) -> None
|
||||
"""Upload file to blob storage if necessary
|
||||
|
@ -1825,6 +1887,38 @@ def upload_for_nonbatch(blob_client, files, kind):
|
|||
return ret
|
||||
|
||||
|
||||
def upload_to_container(blob_client, sa, files, container, gen_sas=True):
|
||||
# type: (azure.storage.blob.BlockBlobService,
|
||||
# settings.StorageCredentialsSettings, List[tuple],
|
||||
# str, bool) -> dict
|
||||
"""Upload files to a specific blob storage container
|
||||
:param azure.storage.blob.BlockBlobService blob_client: blob client
|
||||
:param settings.StorageCredentialsSettings sa: storage account
|
||||
:param list files: files to upload
|
||||
:param str container: container
|
||||
:param bool gen_sas: generate a SAS URL for blob
|
||||
:rtype: dict
|
||||
:return: sas url dict
|
||||
"""
|
||||
sas_urls = {}
|
||||
for file in files:
|
||||
_check_file_and_upload(blob_client, file, None, container=container)
|
||||
sas_urls[file[0]] = 'https://{}.blob.{}/{}/{}'.format(
|
||||
sa.account, sa.endpoint, container, file[0],
|
||||
)
|
||||
if gen_sas:
|
||||
sas_urls[file[0]] = '{}?{}'.format(
|
||||
sas_urls[file[0]],
|
||||
blob_client.generate_blob_shared_access_signature(
|
||||
container, file[0],
|
||||
permission=azureblob.BlobPermissions.READ,
|
||||
expiry=datetime.datetime.utcnow() +
|
||||
datetime.timedelta(days=_DEFAULT_SAS_EXPIRY_DAYS)
|
||||
)
|
||||
)
|
||||
return sas_urls
|
||||
|
||||
|
||||
def create_global_lock_blob(blob_client, kind):
|
||||
# type: (azure.storage.blob.BlockBlobService, str) -> None
|
||||
"""Create a global lock blob
|
||||
|
@ -1982,7 +2076,8 @@ def clear_storage_containers(
|
|||
continue
|
||||
if (key == 'table_monitoring' or
|
||||
key == 'table_federation_global' or
|
||||
key == 'table_federation_jobs'):
|
||||
key == 'table_federation_jobs' or
|
||||
key == 'table_slurm'):
|
||||
continue
|
||||
try:
|
||||
_clear_table(
|
||||
|
@ -2052,7 +2147,7 @@ def create_storage_containers_nonbatch(
|
|||
:param azure.storage.blob.BlockBlobService blob_client: blob client
|
||||
:param azure.cosmosdb.table.TableService table_client: table client
|
||||
:param azure.storage.queue.QueueService queue_service: queue client
|
||||
:param str kind: kind, "remotefs", "monitoring" or "federation"
|
||||
:param str kind: kind, "remotefs", "monitoring", "federation", or "slurm"
|
||||
"""
|
||||
if kind == 'federation':
|
||||
create_storage_containers_nonbatch(
|
||||
|
@ -2143,6 +2238,46 @@ def delete_storage_containers_nonbatch(
|
|||
logger.warning('queue not found: {}'.format(contname))
|
||||
|
||||
|
||||
def delete_file_share_directory(storage_settings, share, directory):
|
||||
# type: (StorageCredentialsSettings, str, str) -> None
|
||||
"""Delete file share directory recursively
|
||||
:param StorageCredentialsSettings storage_settings: storage settings
|
||||
:param str share: share
|
||||
:param str directory: directory to delete
|
||||
"""
|
||||
file_client = azurefile.FileService(
|
||||
account_name=storage_settings.account,
|
||||
account_key=storage_settings.account_key,
|
||||
endpoint_suffix=storage_settings.endpoint)
|
||||
logger.info(
|
||||
'recursively deleting files and directories in share {} at '
|
||||
'directory {}'.format(share, directory))
|
||||
del_dirs = []
|
||||
dirs = [directory]
|
||||
while len(dirs) > 0:
|
||||
dir = dirs.pop()
|
||||
try:
|
||||
objects = file_client.list_directories_and_files(
|
||||
share, directory_name=dir)
|
||||
except azure.common.AzureMissingResourceHttpError:
|
||||
logger.warning('directory {} does not exist on share {}'.format(
|
||||
directory, share))
|
||||
continue
|
||||
del_dirs.append(dir)
|
||||
for obj in objects:
|
||||
path = '{}/{}'.format(dir or '', obj.name)
|
||||
if type(obj) == azurefile.models.File:
|
||||
logger.debug('deleting file {} on share {}'.format(
|
||||
path, share))
|
||||
file_client.delete_file(share, '', path)
|
||||
else:
|
||||
dirs.append(path)
|
||||
del_dirs.append(path)
|
||||
for dir in del_dirs[::-1]:
|
||||
logger.debug('deleting directory {} on share {}'.format(dir, share))
|
||||
file_client.delete_directory(share, dir)
|
||||
|
||||
|
||||
def delete_storage_containers_boot_diagnostics(
|
||||
blob_client, vm_name, vm_id):
|
||||
# type: (azureblob.BlockBlobService, str, str) -> None
|
||||
|
|
|
@ -458,6 +458,16 @@ def compute_md5_for_file(file, as_base64, blocksize=65536):
|
|||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def hash_string(strdata):
|
||||
# type: (str) -> str
|
||||
"""Hash a string
|
||||
:param str strdata: string data to hash
|
||||
:rtype: str
|
||||
:return: hexdigest
|
||||
"""
|
||||
return hashlib.sha1(strdata.encode('utf8')).hexdigest()
|
||||
|
||||
|
||||
def subprocess_with_output(
|
||||
cmd, shell=False, cwd=None, env=None, suppress_output=False):
|
||||
# type: (str, bool, str, dict, bool) -> int
|
||||
|
@ -622,3 +632,25 @@ def ip_from_address_prefix(cidr, start_offset=None, max=None):
|
|||
last = first + max - 1
|
||||
for i in range(first, last + 1):
|
||||
yield socket.inet_ntoa(struct.pack('>L', i))
|
||||
|
||||
|
||||
def explode_arm_subnet_id(arm_subnet_id):
|
||||
# type: (str) -> Tuple[str, str, str, str, str]
|
||||
"""Parses components from ARM subnet id
|
||||
:param str arm_subnet_id: ARM subnet id
|
||||
:rtype: tuple
|
||||
:return: subid, rg, provider, vnet, subnet
|
||||
"""
|
||||
tmp = arm_subnet_id.split('/')
|
||||
try:
|
||||
subid = tmp[2]
|
||||
rg = tmp[4]
|
||||
provider = tmp[6]
|
||||
vnet = tmp[8]
|
||||
subnet = tmp[10]
|
||||
except IndexError:
|
||||
raise ValueError(
|
||||
'Error parsing arm_subnet_id. Make sure the virtual network '
|
||||
'resource id is correct and is postfixed with the '
|
||||
'/subnets/<subnet_id> portion.')
|
||||
return subid, rg, provider, vnet, subnet
|
||||
|
|
|
@ -59,6 +59,7 @@ class ConfigType(enum.Enum):
|
|||
RemoteFS = 5,
|
||||
Monitor = 6,
|
||||
Federation = 7,
|
||||
Slurm = 8,
|
||||
|
||||
|
||||
# global defines
|
||||
|
@ -92,6 +93,10 @@ _SCHEMAS = {
|
|||
'name': 'Federation',
|
||||
'schema': pathlib.Path(_ROOT_PATH, 'schemas/federation.yaml'),
|
||||
},
|
||||
ConfigType.Slurm: {
|
||||
'name': 'Slurm',
|
||||
'schema': pathlib.Path(_ROOT_PATH, 'schemas/slurm.yaml'),
|
||||
},
|
||||
}
|
||||
|
||||
# configure loggers
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
# Dockerfile for Slurm on CentOS 7 for Batch Shipyard
|
||||
|
||||
FROM centos:7
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
WORKDIR /tmp
|
||||
ENV SLURM_VERSION=18.08.5-2
|
||||
|
||||
RUN yum install -y epel-release \
|
||||
&& yum makecache -y fast \
|
||||
&& yum groupinstall -y \
|
||||
"Development Tools" \
|
||||
&& yum install -y \
|
||||
curl \
|
||||
file \
|
||||
python \
|
||||
perl-devel \
|
||||
ruby \
|
||||
ruby-devel \
|
||||
munge-devel \
|
||||
pam-devel \
|
||||
mariadb-devel \
|
||||
numactl-devel \
|
||||
&& gem install fpm \
|
||||
&& yum clean all
|
||||
|
||||
RUN yum install -y numactl-devel perl-devel
|
||||
|
||||
RUN curl -fSsL https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 | tar -jxpf - \
|
||||
&& cd slurm-${SLURM_VERSION} \
|
||||
&& ./configure --prefix=/tmp/slurm-build --sysconfdir=/etc/slurm --with-pam_dir=/usr/lib64/security/ \
|
||||
&& make -j4 \
|
||||
&& make -j4 contrib \
|
||||
&& make install \
|
||||
&& cd /root \
|
||||
&& fpm -s dir -t rpm -v 1.0 -n slurm-${SLURM_VERSION} --prefix=/usr -C /tmp/slurm-build .
|
||||
|
||||
FROM alpine:3.9
|
||||
|
||||
COPY --from=0 /root/slurm-*.rpm /root/
|
||||
COPY slurm*.service /root/
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm controller daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmctld(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmctld
|
||||
ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmctld.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,19 @@
|
|||
[Unit]
|
||||
Description=Slurm node daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmd
|
||||
ExecStart=/usr/sbin/slurmd -d /usr/sbin/slurmstepd $SLURMD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmd.pid
|
||||
KillMode=process
|
||||
LimitNOFILE=51200
|
||||
LimitMEMLOCK=infinity
|
||||
LimitSTACK=infinity
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm DBD accounting daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurmdbd.conf
|
||||
Documentation=man:slurmdbd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmdbd
|
||||
ExecStart=/usr/sbin/slurmdbd $SLURMDBD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmdbd.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,38 @@
|
|||
# Dockerfile for Slurm on Ubuntu 16.04 for Batch Shipyard
|
||||
|
||||
FROM ubuntu:16.04
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
WORKDIR /tmp
|
||||
ENV SLURM_VERSION=18.08.5-2
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
curl \
|
||||
file \
|
||||
python \
|
||||
ruby \
|
||||
ruby-dev \
|
||||
libmunge-dev \
|
||||
libpam0g-dev \
|
||||
libmariadb-client-lgpl-dev \
|
||||
libmysqlclient-dev \
|
||||
numactl \
|
||||
&& gem install fpm \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -fSsL https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 | tar -jxvpf - \
|
||||
&& cd slurm-${SLURM_VERSION} \
|
||||
&& ./configure --prefix=/tmp/slurm-build --sysconfdir=/etc/slurm --with-pam_dir=/lib/x86_64-linux-gnu/security/ \
|
||||
&& make -j4 \
|
||||
&& make -j4 contrib \
|
||||
&& make install \
|
||||
&& cd /root \
|
||||
&& fpm -s dir -t deb -v 1.0 -n slurm-${SLURM_VERSION} --prefix=/usr -C /tmp/slurm-build .
|
||||
|
||||
FROM alpine:3.9
|
||||
|
||||
COPY --from=0 /root/slurm-*.deb /root/
|
||||
COPY slurm*.service /root/
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm controller daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmctld(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmctld
|
||||
ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmctld.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,19 @@
|
|||
[Unit]
|
||||
Description=Slurm node daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmd
|
||||
ExecStart=/usr/sbin/slurmd -d /usr/sbin/slurmstepd $SLURMD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmd.pid
|
||||
KillMode=process
|
||||
LimitNOFILE=51200
|
||||
LimitMEMLOCK=infinity
|
||||
LimitSTACK=infinity
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm DBD accounting daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurmdbd.conf
|
||||
Documentation=man:slurmdbd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmdbd
|
||||
ExecStart=/usr/sbin/slurmdbd $SLURMDBD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmdbd.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,38 @@
|
|||
# Dockerfile for Slurm on Ubuntu 18.04 for Batch Shipyard
|
||||
|
||||
FROM ubuntu:18.04
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
WORKDIR /tmp
|
||||
ENV SLURM_VERSION=18.08.5-2
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
curl \
|
||||
file \
|
||||
python \
|
||||
ruby \
|
||||
ruby-dev \
|
||||
libmunge-dev \
|
||||
libpam0g-dev \
|
||||
libmariadb-client-lgpl-dev \
|
||||
libmysqlclient-dev \
|
||||
numactl \
|
||||
&& gem install fpm \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -fSsL https://download.schedmd.com/slurm/slurm-${SLURM_VERSION}.tar.bz2 | tar -jxvpf - \
|
||||
&& cd slurm-${SLURM_VERSION} \
|
||||
&& ./configure --prefix=/tmp/slurm-build --sysconfdir=/etc/slurm --with-pam_dir=/lib/x86_64-linux-gnu/security/ \
|
||||
&& make -j4 \
|
||||
&& make -j4 contrib \
|
||||
&& make install \
|
||||
&& cd /root \
|
||||
&& fpm -s dir -t deb -v 1.0 -n slurm-${SLURM_VERSION} --prefix=/usr -C /tmp/slurm-build .
|
||||
|
||||
FROM alpine:3.9
|
||||
|
||||
COPY --from=0 /root/slurm-*.deb /root/
|
||||
COPY slurm*.service /root/
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm controller daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmctld(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmctld
|
||||
ExecStart=/usr/sbin/slurmctld $SLURMCTLD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmctld.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,19 @@
|
|||
[Unit]
|
||||
Description=Slurm node daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurm.conf
|
||||
Documentation=man:slurmd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmd
|
||||
ExecStart=/usr/sbin/slurmd -d /usr/sbin/slurmstepd $SLURMD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmd.pid
|
||||
KillMode=process
|
||||
LimitNOFILE=51200
|
||||
LimitMEMLOCK=infinity
|
||||
LimitSTACK=infinity
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=Slurm DBD accounting daemon
|
||||
After=network.target munge.service
|
||||
ConditionPathExists=/etc/slurm/slurmdbd.conf
|
||||
Documentation=man:slurmdbd(8)
|
||||
|
||||
[Service]
|
||||
Type=forking
|
||||
EnvironmentFile=-/etc/default/slurmdbd
|
||||
ExecStart=/usr/sbin/slurmdbd $SLURMDBD_OPTIONS
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
PIDFile=/var/run/slurmdbd.pid
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -26,6 +26,10 @@ Batch Shipyard.
|
|||
7. [Federation](17-batch-shipyard-configuration-federation.md) -
|
||||
Batch Shipyard federation proxy configuration. This configuration is entirely
|
||||
optional unless using the federation capabilities of Batch Shipyard.
|
||||
8. [Slurm](18-batch-shipyard-configuration-slurm.md) -
|
||||
Batch Shipyard [Slurm](https://slurm.schedmd.com/) configuration. This
|
||||
configuration is entirely optional unless using the Slurm on Batch
|
||||
capabilities of Batch Shipyard.
|
||||
|
||||
Note that all potential properties are described here and that specifying
|
||||
all such properties may result in invalid configuration as some properties
|
||||
|
|
|
@ -0,0 +1,300 @@
|
|||
# Batch Shipyard Slurm Configuration
|
||||
This page contains in-depth details on how to configure a
|
||||
[Slurm](https://slurm.schedmd.com/) configuration file for Batch Shipyard.
|
||||
|
||||
## Schema
|
||||
The Slurm schema is as follows:
|
||||
|
||||
```yaml
|
||||
slurm:
|
||||
storage_account_settings: mystorageaccount
|
||||
location: <Azure region, e.g., eastus>
|
||||
resource_group: my-slurm-rg
|
||||
cluster_id: slurm
|
||||
controller:
|
||||
ssh:
|
||||
username: shipyard
|
||||
ssh_public_key: /path/to/rsa/publickey.pub
|
||||
ssh_public_key_data: ssh-rsa ...
|
||||
ssh_private_key: /path/to/rsa/privatekey
|
||||
generated_file_export_path: null
|
||||
public_ip:
|
||||
enabled: true
|
||||
static: false
|
||||
virtual_network:
|
||||
name: myvnet
|
||||
resource_group: my-vnet-resource-group
|
||||
existing_ok: false
|
||||
address_space: 10.0.0.0/16
|
||||
subnet:
|
||||
name: my-slurm-controller-subnet
|
||||
address_prefix: 10.0.1.0/24
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
custom_inbound_rules:
|
||||
myrule:
|
||||
destination_port_range: 5000-5001
|
||||
protocol: '*'
|
||||
source_address_prefix:
|
||||
- 1.2.3.4
|
||||
- 5.6.7.0/24
|
||||
vm_size: STANDARD_D2_V2
|
||||
vm_count: 2
|
||||
accelerated_networking: false
|
||||
additional_prep_script: /path/to/some/script-controller.sh
|
||||
login:
|
||||
ssh:
|
||||
username: shipyard
|
||||
ssh_public_key: /path/to/rsa/publickey.pub
|
||||
ssh_public_key_data: ssh-rsa ...
|
||||
ssh_private_key: /path/to/rsa/privatekey
|
||||
generated_file_export_path: null
|
||||
public_ip:
|
||||
enabled: true
|
||||
static: false
|
||||
virtual_network:
|
||||
name: myvnet
|
||||
resource_group: my-vnet-resource-group
|
||||
existing_ok: false
|
||||
address_space: 10.0.0.0/16
|
||||
subnet:
|
||||
name: my-slurm-login-subnet
|
||||
address_prefix: 10.0.2.0/24
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
custom_inbound_rules:
|
||||
myrule:
|
||||
destination_port_range: 5000-5001
|
||||
protocol: '*'
|
||||
source_address_prefix:
|
||||
- 1.2.3.4
|
||||
- 5.6.7.0/24
|
||||
vm_size: STANDARD_D4_V2
|
||||
vm_count: 1
|
||||
accelerated_networking: false
|
||||
additional_prep_script: /path/to/some/script-login.sh
|
||||
shared_data_volumes:
|
||||
nfs_server:
|
||||
mount_path: /shared
|
||||
store_slurmctld_state: true
|
||||
slurm_options:
|
||||
idle_reclaim_time: 00:15:00
|
||||
elastic_partitions:
|
||||
partition_1:
|
||||
batch_pools:
|
||||
mypool1:
|
||||
account_service_url: https://...
|
||||
compute_node_type: dedicated
|
||||
max_compute_nodes: 32
|
||||
weight: 0
|
||||
features:
|
||||
- arbitrary_constraint_1
|
||||
reclaim_exclude_num_nodes: 8
|
||||
mypool2:
|
||||
account_service_url: https://...
|
||||
compute_node_type: low_priority
|
||||
max_compute_nodes: 128
|
||||
weight: 1
|
||||
features:
|
||||
- arbitrary_constraint_2
|
||||
reclaim_exclude_num_nodes: 0
|
||||
max_runtime_limit: null
|
||||
default: true
|
||||
partition_2:
|
||||
batch_pools:
|
||||
mypool3:
|
||||
account_service_url: https://...
|
||||
compute_node_type: low_priority
|
||||
max_compute_nodes: 256
|
||||
weight: 2
|
||||
features: []
|
||||
reclaim_exclude_num_nodes: 0
|
||||
max_runtime_limit: 1.12:00:00
|
||||
default: false
|
||||
unmanaged_partitions:
|
||||
- partition: 'PartitionName=onprem Nodes=onprem-[0-31] Default=No MaxTime=INFINITE State=UP'
|
||||
nodes:
|
||||
- 'NodeName=onprem-[0-31] CPUs=512 Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 RealMemory=512128 State=UNKNOWN'
|
||||
```
|
||||
|
||||
The `slurm` property has the following members:
|
||||
|
||||
* (required) `storage_account_settings` is the storage account link to store
|
||||
all Slurm metadata. Any `slurm` command that must store metadata or
|
||||
actions uses this storage account.
|
||||
* (required) `location` is the Azure region name for the resources, e.g.,
|
||||
`eastus` or `northeurope`.
|
||||
* (required) `resource_group` this is the resource group to use for the
|
||||
Slurm resources.
|
||||
* (required) `cluster_id` is the name of the Slurm cluster to create. This
|
||||
is also the DNS label prefix to apply to each virtual machine and resource
|
||||
allocated for the Slurm cluster. It should be unique and must be between
1 and 22 characters in length.
|
||||
|
||||
There are two required sections for resources that comprise the Slurm
|
||||
cluster: `controller` and `login`. The `controller` section specifies the VM
|
||||
configuration which hosts the Slurm controller (and possibly the Slurm DBD).
|
||||
The `login` section specifies the VM configuration which hosts the login nodes
|
||||
for the Slurm cluster.
|
||||
|
||||
Both the `controller` and `login` sections have the following identical
|
||||
configuration properties:
|
||||
|
||||
* (required) `ssh` is the SSH admin user to create on the machine.
|
||||
If you are running Batch Shipyard on Windows, please refer to
|
||||
[these instructions](85-batch-shipyard-ssh-docker-tunnel.md#ssh-keygen)
|
||||
on how to generate an SSH keypair for use with Batch Shipyard.
|
||||
* (required) `username` is the admin user to create on all virtual machines
|
||||
* (optional) `ssh_public_key` is the path to a pre-existing ssh public
|
||||
key to use. If this is not specified, an RSA public/private key pair will
|
||||
be generated for use in your current working directory (with a
|
||||
non-colliding name for auto-generated SSH keys for compute pools, i.e.,
|
||||
`id_rsa_shipyard_remotefs`). On Windows only, if this option is not
|
||||
specified, the SSH keys are not auto-generated (unless `ssh-keygen.exe`
|
||||
can be invoked in the current working directory or is in `%PATH%`).
|
||||
This option cannot be specified with `ssh_public_key_data`.
|
||||
* (optional) `ssh_public_key_data` is the raw RSA public key data in
|
||||
OpenSSH format, e.g., a string starting with `ssh-rsa ...`. Only one
|
||||
key may be specified. This option cannot be specified with
|
||||
`ssh_public_key`.
|
||||
* (optional) `ssh_private_key` is the path to an existing SSH private key
|
||||
to use against either `ssh_public_key` or `ssh_public_key_data` for
|
||||
connecting to storage nodes and performing operations that require SSH
|
||||
such as cluster resize and detail status. This option should only be
|
||||
specified if either `ssh_public_key` or `ssh_public_key_data` are
|
||||
specified.
|
||||
* (optional) `generated_file_export_path` is an optional path to specify
|
||||
for where to create the RSA public/private key pair.
|
||||
* (optional) `public_ip` are public IP properties for the virtual machine.
|
||||
* (optional) `enabled` designates if public IPs should be assigned. The
|
||||
default is `true`. Note that if public IP is disabled, then you must
|
||||
create an alternate means for accessing the Slurm resource virtual
|
||||
machine through a "jumpbox" on the virtual network. If this property
|
||||
is set to `false` (disabled), then any action requiring SSH, or the
|
||||
SSH command itself, will occur against the private IP address of the
|
||||
virtual machine.
|
||||
* (optional) `static` is to specify if static public IPs should be assigned
|
||||
to each virtual machine allocated. The default is `false` which
|
||||
results in dynamic public IP addresses. A "static" FQDN will be provided
|
||||
per virtual machine, regardless of this setting if public IPs are
|
||||
enabled.
|
||||
* (required) `virtual_network` is the virtual network to use for the
|
||||
Slurm resource.
|
||||
* (required) `name` is the virtual network name
|
||||
* (optional) `resource_group` is the resource group for the virtual
|
||||
network. If this is not specified, the resource group name falls back
|
||||
to the resource group specified in the Slurm resource.
|
||||
* (optional) `existing_ok` allows use of a pre-existing virtual network.
|
||||
The default is `false`.
|
||||
* (required if creating, optional otherwise) `address_space` is the
|
||||
allowed address space for the virtual network.
|
||||
* (required) `subnet` specifies the subnet properties.
|
||||
* (required) `name` is the subnet name.
|
||||
* (required) `address_prefix` is the subnet address prefix to use for
|
||||
allocation of the Slurm resource virtual machine to.
|
||||
* (required) `network_security` defines the network security rules to apply
|
||||
to the Slurm resource virtual machine.
|
||||
* (required) `ssh` is the rule for which address prefixes to allow for
|
||||
connecting to sshd port 22 on the virtual machine. In the example, `"*"`
|
||||
allows any IP address to connect. This is an array property which allows
|
||||
multiple address prefixes to be specified.
|
||||
* (optional) `grafana` rule allows grafana HTTPS (443) server port to be
|
||||
exposed to the specified address prefix. Multiple address prefixes
|
||||
can be specified.
|
||||
* (optional) `prometheus` rule allows the Prometheus server port to be
|
||||
exposed to the specified address prefix. Multiple address prefixes
|
||||
can be specified.
|
||||
* (optional) `custom_inbound_rules` are custom inbound rules for other
|
||||
services that you need to expose.
|
||||
* (required) `<rule name>` is the name of the rule; the example uses
|
||||
`myrule`. Each rule name should be unique.
|
||||
* (required) `destination_port_range` is the ports on each virtual
|
||||
machine that will be exposed. This can be a single port and
|
||||
should be a string.
|
||||
* (required) `source_address_prefix` is an array of address
|
||||
prefixes to allow.
|
||||
* (required) `protocol` is the protocol to allow. Valid values are
|
||||
`tcp`, `udp` and `*` (which means any protocol).
|
||||
* (required) `vm_size` is the virtual machine instance size to use.
|
||||
* (required) `vm_count` is the number of virtual machines to allocate of
|
||||
this instance type. For `controller`, a value greater than `1` will create
|
||||
an HA Slurm cluster. Additionally, a value greater than `1` will
|
||||
automatically place the VMs in an availability set.
|
||||
* (optional) `accelerated_networking` enables or disables
|
||||
[accelerated networking](https://docs.microsoft.com/azure/virtual-network/create-vm-accelerated-networking-cli).
|
||||
The default is `false` if not specified.
|
||||
* (optional) `additional_prep_script` property specifies a local file which
|
||||
will be uploaded then executed for additional prep/configuration that should
|
||||
be applied to each Slurm resource.
|
||||
|
||||
There are two required sections for specifying how the Slurm
|
||||
cluster is configured: `shared_data_volumes` and `slurm_options` sections.
|
||||
The `shared_data_volumes` section configures shared file systems (or
|
||||
RemoteFS clusters as provisioned by Batch Shipyard). The `slurm_options`
|
||||
section configures the Slurm partitions.
|
||||
|
||||
The following describes the `shared_data_volumes` configuration:
|
||||
|
||||
* (required) Storage cluster id is a named dictionary key that refers
|
||||
to a defined storage cluster in the global configuration file (and
|
||||
subsequently the RemoteFS configuration).
|
||||
* (required) `mount_path` is the mount path across all Slurm resources
|
||||
and compute nodes.
|
||||
* (required) `store_slurmctld_state` designates this shared data volume
|
||||
as the volume that hosts the slurmctld state for HA failover.
|
||||
|
||||
The following describes the `slurm_options` configuration:
|
||||
|
||||
* (required) `idle_reclaim_time` specifies the amount of time required to
|
||||
pass while nodes are idle for them to be reclaimed (or suspended) by Slurm.
|
||||
The format for this property is a timedelta with a string
representation of "d.HH:mm:ss". "HH:mm:ss" is required but "d" is
optional (a parsing sketch is shown after this list).
|
||||
* (required) `elastic_partitions` specifies the Slurm partitions to create
|
||||
for elastic cloud bursting onto Azure Batch
|
||||
* (required) Unique name of the partition
|
||||
* (required) `batch_pools` specifies the Batch pools which will be
|
||||
dynamically sized by Batch Shipyard and Slurm. All Batch pools
|
||||
should be pre-allocated (unless using the `orchestrate` command
|
||||
in conjunction with using one pool) with 0 nodes.
|
||||
* (required) Batch Pool Id
|
||||
* (optional) `account_service_url` is the Batch account
|
||||
service URL associated with this Batch pool. Currently,
|
||||
this is restricted to the service url specified in the
|
||||
credentials file.
|
||||
* (required) `compute_node_type` is the compute node type
|
||||
to allocate, can be either `dedicated` or `low_priority`.
|
||||
* (required) `max_compute_nodes` is the maximum number of
|
||||
compute nodes that can be allocated.
|
||||
* (required) `weight` is this weight for this Batch pool in
|
||||
this partition. See the Slurm documentation for more details.
|
||||
* (optional) `features` are additional features labeled on
|
||||
this partition.
|
||||
* (optional) `reclaim_exclude_num_nodes` is the number of
|
||||
nodes to exclude from reclaiming for this Batch pool.
|
||||
* (optional) `max_runtime_limit` imposes a maximum runtime limit
|
||||
for this partition. The format for this property is a timedelta
|
||||
with a string representation of "d.HH:mm:ss". "HH:mm:ss" is
|
||||
required but "d" is optional.
|
||||
* (required) `default` designates this partition as the default
|
||||
partition.
|
||||
* (optional) `unmanaged_partitions` specifies partitions which are not
|
||||
managed by Batch Shipyard but those that you wish to join to the Slurm
|
||||
controller. This is useful for joining on-premises nodes within the same
|
||||
Virtual Network (or peered) to the Slurm cluster. Each sequence member
|
||||
has the properties:
|
||||
* (required) `partition` specifies the partition entry in the Slurm
|
||||
configuration file.
|
||||
* (required) `nodes` is a sequence of Slurm node entries in the Slurm
|
||||
configuration file as it relates to the partition.
|
||||
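A minimal sketch of how a "d.HH:mm:ss" timedelta string (as used by
`idle_reclaim_time` and `max_runtime_limit`) could be parsed is shown below.
This is illustrative only and is not the helper Batch Shipyard itself uses;
the `parse_timedelta` name is hypothetical.

```python
import datetime
import re

# optional "d." days component followed by a required "HH:mm:ss" portion
_TIMEDELTA_RE = re.compile(
    r'^(?:(?P<days>\d+)\.)?(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+)$')


def parse_timedelta(value):
    # returns a datetime.timedelta for a "d.HH:mm:ss" string
    match = _TIMEDELTA_RE.match(value)
    if match is None:
        raise ValueError('invalid timedelta string: {}'.format(value))
    parts = {k: int(v) for k, v in match.groupdict(default='0').items()}
    return datetime.timedelta(**parts)


# parse_timedelta('00:15:00') -> 15 minutes
# parse_timedelta('1.12:00:00') -> 1 day, 12 hours
```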
|
||||
## Slurm with Batch Shipyard Guide
|
||||
Please see the [full guide](69-batch-shipyard-slurm.md) for
|
||||
relevant terminology and information on how this feature works in Batch
|
||||
Shipyard.
|
||||
|
||||
## Full template
|
||||
A full template of a Slurm cluster configuration file can be found
|
||||
[here](https://github.com/Azure/batch-shipyard/tree/master/config_templates).
|
||||
Note that these templates cannot be used as-is and must be modified to fit
|
||||
your scenario.
|
|
@ -225,12 +225,14 @@ instead:
|
|||
cert Certificate actions
|
||||
data Data actions
|
||||
diag Diagnostics actions
|
||||
fed Federation actions
|
||||
fs Filesystem in Azure actions
|
||||
jobs Jobs actions
|
||||
keyvault KeyVault actions
|
||||
misc Miscellaneous actions
|
||||
monitor Monitoring actions
|
||||
pool Pool actions
|
||||
slurm Slurm on Batch actions
|
||||
storage Storage actions
|
||||
```
|
||||
|
||||
|
@ -238,6 +240,7 @@ instead:
|
|||
* `cert` commands deal with certificates to be used with Azure Batch
|
||||
* `data` commands deal with data ingress and egress from Azure
|
||||
* `diag` commands deal with diagnostics for Azure Batch
|
||||
* `fed` commands deal with Batch Shipyard Federations
|
||||
* `fs` commands deal with Batch Shipyard provisioned remote filesystems in
|
||||
Azure
|
||||
* `jobs` commands deal with Azure Batch jobs and tasks
|
||||
|
@ -246,6 +249,7 @@ Shipyard
|
|||
* `misc` commands are miscellaneous commands that don't fall into other
|
||||
categories
|
||||
* `pool` commands deal with Azure Batch pools
|
||||
* `slurm` commands deal with Slurm on Batch
|
||||
* `storage` commands deal with Batch Shipyard metadata on Azure Storage
|
||||
|
||||
## `account` Command
|
||||
|
@ -499,14 +503,15 @@ parts of a remote filesystem:
|
|||
### `fs cluster` Command
|
||||
`fs cluster` command has the following sub-commands:
|
||||
```
|
||||
add Create a filesystem storage cluster in Azure
|
||||
del Delete a filesystem storage cluster in Azure
|
||||
expand Expand a filesystem storage cluster in Azure
|
||||
resize Resize a filesystem storage cluster in Azure.
|
||||
ssh Interactively login via SSH to a filesystem...
|
||||
start Starts a previously suspended filesystem...
|
||||
status Query status of a filesystem storage cluster...
|
||||
suspend Suspend a filesystem storage cluster in Azure
|
||||
add Create a filesystem storage cluster in Azure
|
||||
del Delete a filesystem storage cluster in Azure
|
||||
expand Expand a filesystem storage cluster in Azure
|
||||
orchestrate Orchestrate a filesystem storage cluster in Azure with the...
|
||||
resize Resize a filesystem storage cluster in Azure.
|
||||
ssh Interactively login via SSH to a filesystem storage cluster...
|
||||
start Starts a previously suspended filesystem storage cluster in...
|
||||
status Query status of a filesystem storage cluster in Azure
|
||||
suspend Suspend a filesystem storage cluster in Azure
|
||||
```
|
||||
As the `fs.yaml` configuration file can contain multiple storage cluster
|
||||
definitions, all `fs cluster` commands require the argument
|
||||
|
@ -534,6 +539,8 @@ storage cluster to perform actions against.
|
|||
the file server.
|
||||
* `--no-rebalance` rebalances the data and metadata among the disks for
|
||||
better data spread and performance after the disk is added to the array.
|
||||
* `orchestrate` will create the remote disks and the remote fs cluster as
|
||||
defined in the fs config file
|
||||
* `resize` resizes the storage cluster with additional virtual machines as
|
||||
specified in the configuration. This is an experimental feature.
|
||||
* `ssh` will interactively log into a virtual machine in the storage cluster.
|
||||
|
@ -915,6 +922,73 @@ configuration file to all nodes in the specified pool
|
|||
* `user del` will delete the SSH or RDP user defined in the pool
|
||||
configuration file from all nodes in the specified pool
|
||||
|
||||
## `slurm` Command
|
||||
The `slurm` command has the following sub-commands:
|
||||
```
|
||||
cluster Slurm cluster actions
|
||||
ssh Slurm SSH actions
|
||||
```
|
||||
|
||||
The `slurm cluster` sub-command has the following sub-sub-commands:
|
||||
```
|
||||
create Create a Slurm cluster with controllers and login nodes
|
||||
destroy Destroy a Slurm controller
|
||||
orchestrate Orchestrate a Slurm cluster with shared file system and
|
||||
Batch...
|
||||
status Query status of a Slurm controllers and login nodes
|
||||
```
|
||||
|
||||
The `slurm ssh` sub-command has the following sub-sub-commands:
|
||||
```
|
||||
controller Interactively login via SSH to a Slurm controller virtual...
|
||||
login Interactively login via SSH to a Slurm login/gateway virtual...
|
||||
node Interactively login via SSH to a Slurm compute node virtual...
|
||||
```
|
||||
|
||||
* `cluster create` will create the Slurm controller and login portions of
|
||||
the cluster
|
||||
* `cluster destroy` will destroy the Slurm controller and login portions of
|
||||
the cluster
|
||||
* `--delete-resource-group` will delete the entire resource group that
|
||||
contains the Slurm resources. Please take care when using this
|
||||
option as every resource in the resource group is deleted, including
resources that are not related to Batch Shipyard.
|
||||
* `--delete-virtual-network` will delete the virtual network and all of
|
||||
its subnets
|
||||
* `--generate-from-prefix` will attempt to generate all resource names
|
||||
using the standard naming conventions. This is helpful when there was an issue with
|
||||
creation/deletion and the original virtual machine resources
|
||||
cannot be enumerated. Note that OS disks cannot be deleted with this
|
||||
option. Please use an alternate means (i.e., the Azure Portal) to
|
||||
delete disks that may have been used by the Slurm resource VMs.
|
||||
* `--no-wait` does not wait for deletion completion. It is not recommended
|
||||
to use this parameter.
|
||||
* `cluster orchestrate` will orchestrate the entire Slurm cluster with a
|
||||
single Batch pool (an example invocation is shown after this list)
|
||||
* `--storage-cluster-id` will orchestrate the specified RemoteFS shared
|
||||
file system
|
||||
* `cluster status` queries the status of the Slurm controller and login nodes
|
||||
* `ssh controller` will SSH into the Slurm controller nodes if permitted with
|
||||
the controller SSH user
|
||||
* `COMMAND` is an optional argument to specify the command to run. If your
|
||||
command has switches, preface `COMMAND` with double dash as per POSIX
|
||||
convention, e.g., `pool ssh -- sudo docker ps -a`.
|
||||
* `--offset` is the cardinal offset of the controller node
|
||||
* `--tty` allocates a pseudo-terminal
|
||||
* `ssh login` will SSH into the Slurm login nodes with the cluster user
|
||||
identity
|
||||
* `COMMAND` is an optional argument to specify the command to run. If your
|
||||
command has switches, preface `COMMAND` with double dash as per POSIX
|
||||
convention, e.g., `pool ssh -- sudo docker ps -a`.
|
||||
* `--offset` is the cardinal offset of the login node
|
||||
* `--tty` allocates a pseudo-terminal
|
||||
* `ssh node` will SSH into a Batch compute node with the cluster user identity
|
||||
* `COMMAND` is an optional argument to specify the command to run. If your
|
||||
command has switches, preface `COMMAND` with double dash as per POSIX
|
||||
convention, e.g., `pool ssh -- sudo docker ps -a`.
|
||||
* `--node-name` is the required Slurm node name
|
||||
* `--tty` allocates a pseudo-terminal
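For reference, the following hypothetical invocations illustrate the
sub-commands above using the `SHIPYARD_CONFIGDIR` convention shown elsewhere
in this documentation; the storage cluster id and the commands passed after
`--` are placeholders for your own deployment.

```shell
# orchestrate a full Slurm cluster with a single Batch pool and a RemoteFS
# shared file system (pool and fs configuration files must be present)
SHIPYARD_CONFIGDIR=. ./shipyard slurm cluster orchestrate --storage-cluster-id mystoragecluster -y

# query the status of the controller and login nodes
SHIPYARD_CONFIGDIR=. ./shipyard slurm cluster status

# SSH to a login node with the cluster user and inspect the partitions
SHIPYARD_CONFIGDIR=. ./shipyard slurm ssh login -- sinfo

# tear down the controller and login nodes when finished
SHIPYARD_CONFIGDIR=. ./shipyard slurm cluster destroy -y
```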
|
||||
|
||||
## `storage` Command
|
||||
The `storage` command has the following sub-commands:
|
||||
```
|
||||
|
|
|
@ -0,0 +1,319 @@
|
|||
# Slurm on Batch with Batch Shipyard
|
||||
The focus of this article is to explain the Slurm on Batch functionality
|
||||
in Batch Shipyard and how to effectively deploy your workload for
|
||||
traditional lift-and-shift scheduling while leveraging some
|
||||
Platform-as-a-Service capabilities of Azure Batch.
|
||||
|
||||
## Overview
|
||||
The [Slurm](https://slurm.schedmd.com/) workload manager is an open-source
|
||||
job scheduler that is widely used among many institutional and supercomputing
|
||||
sites. Azure Batch provides an abstraction for managing lower-layer VM
|
||||
complexities and automated recovery through Batch pools. Batch Shipyard
|
||||
provides an integration between Slurm and Batch pools where the Slurm cluster
|
||||
controller and login nodes are provisioned and connected to compute nodes in
|
||||
Batch pools in an on-demand fashion.
|
||||
|
||||
### Why?
|
||||
Why is this feature useful when you can use
|
||||
[Azure Batch](https://azure.microsoft.com/services/batch/) natively as a job
|
||||
scheduler or leverage
|
||||
[Azure CycleCloud](https://azure.microsoft.com/features/azure-cyclecloud/)?
|
||||
|
||||
Some users or organizations may prefer native Slurm tooling and execution
workflows that are not currently possible with Azure Batch, whether due to
workflow familiarity or existing investments in the Slurm ecosystem.
Additionally, Azure Batch may not provide some of the rich job
|
||||
scheduling and accounting functionality available in Slurm that may be
|
||||
required for some organizational workflows. Moreover, requirements such as
standing up a separate VM for CycleCloud or managing the underlying Slurm
compute node infrastructure may not be amenable to some users or
organizations.
|
||||
|
||||
Slurm on Batch with Batch Shipyard attempts to combine the advantages of
both worlds: the Slurm scheduler paired with the platform benefits of Azure
Batch compute node orchestration and management.
|
||||
|
||||
## Major Features
|
||||
* Simple and automated Slurm cluster creation
|
||||
* Automatic HA support of Slurm controllers and the ability to create
|
||||
multiple login nodes
|
||||
* Ability to specify arbitrary elastic partitions which may be composed of a
heterogeneous mixture of Batch pools
|
||||
* Automatic linking of shared file systems (RemoteFS clusters) between
|
||||
all Slurm resources
|
||||
* Support for concurrent dedicated and low priority compute nodes within
|
||||
partitions
|
||||
* Automatic feature tagging of nodes, including VM size and capabilities
|
||||
* Automatic generic resource configuration for GPU VMs
|
||||
* Automatic on-demand resizing of compute node resources including
|
||||
user-specified idle reclaim timeouts and node reclaim exclusion filters
|
||||
* Support for custom preparation scripts on all Slurm resources
|
||||
* Goal-seeking engine to recover from compute node allocation failures
|
||||
* Default cluster user SSH is linked to login nodes and compute nodes for
|
||||
easy logins and file access across non-controller resources
|
||||
* Supports most Batch Shipyard configuration options on the pool, including
|
||||
distributed scratch, container runtime installations, monitoring integration,
|
||||
shared file system mounting, automatic GPU setup, etc.
|
||||
* Supports joining pre-existing partitions and nodes which may be on-premises
|
||||
with elastic on-demand nodes
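Because node features and GPU generic resources are tagged automatically (as
noted above), standard Slurm selectors can be used at submission time. The
following is only a minimal sketch; the feature name and GRES count are
hypothetical and depend on the VM size and tagging of your pools.

```shell
# request a node advertising a hypothetical VM size feature tag and one GPU
# generic resource, then run a trivial command via sbatch --wrap
sbatch --constraint=standard_nc6 --gres=gpu:1 --wrap "nvidia-smi"
```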
|
||||
|
||||
## Mental Model
|
||||
### Slurm Dynamic Node Allocation and Deallocation
|
||||
A Slurm cluster on Batch with Batch Shipyard utilizes the
|
||||
[Slurm Elastic Computing (Cloud Bursting)](https://slurm.schedmd.com/elastic_computing.html)
|
||||
functionality which is based on Slurm's
|
||||
[Power Save](https://slurm.schedmd.com/power_save.html) capabilities.
|
||||
In a nutshell, Slurm will `resume` nodes when needed to process jobs and
|
||||
`suspend` nodes once there is no need for the nodes to run (i.e., relinquish
|
||||
them back to the cloud).
|
||||
|
||||
When Slurm decides that new nodes should be provisioned, the `resume`
|
||||
command triggers the `Batch Shipyard Slurm Helper` which allocates compute
|
||||
nodes on the appropriate Batch pool targeting a specific Azure region.
|
||||
Batch Shipyard handles the complexity of node name assignment, host DNS
|
||||
registration, and ensuring the controller updates the node information with
|
||||
the appropriate IP address.
|
||||
|
||||
When Slurm decides that nodes should be removed via `suspend`, the
|
||||
`Batch Shipyard Slurm Helper` will deallocate these nodes in their
|
||||
respective pools and release the node names back for availability.
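Under the hood, this behavior maps onto upstream Slurm power save directives
in `slurm.conf`. The excerpt below is only an illustrative sketch of that
mechanism, not the exact configuration Batch Shipyard generates; the program
paths, node names, and timings are placeholders.

```
SuspendProgram=/opt/batch-shipyard/slurm/suspend.sh   # invokes the Batch Shipyard Slurm Helper
ResumeProgram=/opt/batch-shipyard/slurm/resume.sh
SuspendTime=900          # idle seconds before a node is relinquished
ResumeTimeout=1200
SuspendTimeout=600
NodeName=mypool1-[0-31] State=CLOUD Weight=0 Feature=arbitrary_constraint_1
PartitionName=partition_1 Nodes=mypool1-[0-31] Default=YES State=UP
```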
|
||||
|
||||
### Batch Pools as Slurm Compute Nodes
|
||||
A Batch Shipyard provisioned Slurm cluster is built on top of different
|
||||
resources in Azure. To more readily explain the concepts that form a Batch
|
||||
Shipyard Slurm cluster, let's start with a high-level conceptual
|
||||
layout of all of the components and possible interactions.
|
||||
|
||||
```
|
||||
+---------------+
|
||||
| |
|
||||
+----------+ +-----------------> Azure Storage <----------------+
|
||||
| | | | | |
|
||||
| Azure | | +---------------+ |
|
||||
| Resource | | |
|
||||
| Manager | | +-------------+ |
|
||||
| | | | | |
|
||||
+------^---+ | +-------------> Azure Batch +------------+ |
|
||||
| | | | | | |
|
||||
MSI | MSI | | MSI +-------------+ | |
|
||||
| | | | |
|
||||
+-------------------------------------------------------------------------------+
|
||||
| | | | | | |
|
||||
| | | | +----v----+--------+ |
|
||||
| +------------------------+ | | |
|
||||
| | | | | | | +--------+ | |
|
||||
| | +--+------+----+-+ | | | | | |
|
||||
| | | | <-----------------------------> | slurmd | | |
|
||||
| | | Batch Shipyard | | | | | | |
|
||||
| | | Slurm Helper | | | +--------+ | |
|
||||
| | | | | | | |
|
||||
| | +----------------+ | +----------------+ | +--------------+ | |
|
||||
| | | | | | | | | |
|
||||
| | +-----------+ | | Batch Shipyard | | | Slurm client | | |
|
||||
| | | | | | Remote FS VMs | | | tools | | |
|
||||
| | | slurmctld | | | | | | | | |
|
||||
| | | | +------> Subnet A <-----+ +--------------+ | |
|
||||
| | +-----------+ | | 10.0.1.0/24 | | | |
|
||||
| | | +-------^--------+ | Azure Batch | |
|
||||
| | Slurm Controller Nodes | | | Compute Nodes | |
|
||||
| | | | | | |
|
||||
| | Subnet B | | | Subnet D | |
|
||||
| | 10.0.2.0/24 | | | 10.1.0.0/16 | |
|
||||
| +----------^-------------+ | +------------------+ |
|
||||
| | +--------+---------+ |
|
||||
| | | | |
|
||||
| | | +--------------+ | |
|
||||
| | | | | | |
|
||||
| +-------------------+ | Slurm client | | |
|
||||
| | | tools | | |
|
||||
| | | | | |
|
||||
| | +--------------+ | |
|
||||
| | | |
|
||||
| | Login Nodes | |
|
||||
| | | |
|
||||
| | Subnet C | |
|
||||
| | 10.0.3.0/24 | |
|
||||
| Virtual Network +---------^--------+ |
|
||||
| 10.0.0.0/8 | |
|
||||
+-------------------------------------------------------------------------------+
|
||||
|
|
||||
SSH |
|
||||
|
|
||||
+-------+------+
|
||||
| |
|
||||
| Cluster User |
|
||||
| |
|
||||
+--------------+
|
||||
```
|
||||
|
||||
The base layer for all of the resources within a Slurm cluster on Batch is
|
||||
an Azure Virtual Network. This virtual network can be shared
|
||||
amongst other network-level resources such as network interfaces. The virtual
|
||||
network can be "partitioned" into sub-address spaces through the use of
|
||||
subnets. In the example above, we have four subnets where
|
||||
`Subnet A 10.0.1.0/24` hosts the Batch Shipyard RemoteFS shared file system,
|
||||
`Subnet B 10.0.2.0/24` contains the Slurm controller nodes,
|
||||
`Subnet C 10.0.3.0/24` contains the login nodes,
|
||||
and `Subnet D 10.1.0.0/16` contains a pool or a collection of pools of
|
||||
Azure Batch compute nodes to serve as dynamically allocated Slurm
|
||||
compute nodes.
|
||||
|
||||
One (or more) RemoteFS shared file systems can be used as a common file system
|
||||
between login nodes and the Slurm compute nodes (provisioned as Batch compute
|
||||
nodes). One of these file systems is also designated to store `slurmctld`
state for HA/failover of standby Slurm controller nodes. Cluster users log
in to the Slurm cluster via the login nodes, where the shared file system
is mounted and the Slurm client tools, which submit to the controller
nodes, are installed.
|
||||
|
||||
Slurm configuration and munge keys are propagated to the provisioned compute
|
||||
nodes in Batch pools along with mounting the appropriate RemoteFS shared
|
||||
file systems. Once these nodes are provisioned and idle, the node information
|
||||
is updated on the controller nodes to be available for Slurm job scheduling.
|
||||
|
||||
When Slurm signals that nodes are no longer needed, the Batch Shipyard
|
||||
Slurm helper will then translate the Slurm node names back to Batch compute
|
||||
node ids and deprovision appropriately.
|
||||
|
||||
## Walkthrough
|
||||
The following is a brief walkthrough of configuring a Slurm on Batch cluster
|
||||
with Batch Shipyard.
|
||||
|
||||
### Azure Active Directory Authentication Required
|
||||
Azure Active Directory authentication is required to create a Slurm cluster.
|
||||
When executing either the `slurm cluster create` or `slurm cluster orchestrate`
command, your service principal must be at least `Owner`, or a
[custom role](https://docs.microsoft.com/azure/active-directory/role-based-access-control-custom-roles)
that does not prohibit the following action, in addition to having the
ability to create/read/write resources for the subscription:
|
||||
|
||||
* `Microsoft.Authorization/*/Write`
|
||||
|
||||
This action is required to enable
|
||||
[Azure Managed Service Identity](https://docs.microsoft.com/azure/active-directory/managed-service-identity/overview)
|
||||
on the Batch Shipyard Slurm Helper which runs on controller nodes.
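If you prefer not to grant `Owner`, one possible approach is to define a
custom role that includes the action above. The sketch below uses the Azure
CLI; the role name, the action list beyond `Microsoft.Authorization/*/Write`,
and the subscription id are placeholders and should be adapted per the
linked custom role documentation.

```shell
# define a hypothetical custom role that permits MSI role assignment writes
cat > slurm-helper-role.json << 'EOF'
{
  "Name": "Batch Shipyard Slurm Helper",
  "IsCustom": true,
  "Description": "Permits role assignment writes plus resource create/read/write",
  "Actions": [
    "Microsoft.Authorization/*/Write",
    "Microsoft.Resources/*",
    "Microsoft.Compute/*",
    "Microsoft.Network/*",
    "Microsoft.Batch/*",
    "Microsoft.Storage/*"
  ],
  "NotActions": [],
  "AssignableScopes": [ "/subscriptions/<subscription id>" ]
}
EOF

# register the role definition with the subscription
az role definition create --role-definition @slurm-helper-role.json
```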
|
||||
|
||||
### Configuration
|
||||
The configuration for a Slurm cluster with Batch Shipyard is generally
composed of two major parts: the Slurm configuration and the normal global
and pool configurations.
|
||||
|
||||
#### Slurm Cluster Configuration
|
||||
The Slurm cluster configuration is defined by a Slurm configuration
|
||||
file. Please refer to the full
|
||||
[Slurm cluster configuration documentation](18-batch-shipyard-configuration-slurm.md)
|
||||
for more detailed explanations of each option and for those not shown below.
|
||||
|
||||
Conceptually, this file consists of five major parts:
|
||||
|
||||
```yaml
|
||||
slurm:
|
||||
# 1. define general settings
|
||||
storage_account_settings: mystorageaccount
|
||||
location: <Azure region, e.g., eastus>
|
||||
resource_group: my-slurm-rg
|
||||
cluster_id: slurm
|
||||
# 2. define controller node settings
|
||||
controller:
|
||||
ssh:
|
||||
# SSH access/user to the controller nodes, independent of other resources
|
||||
public_ip:
|
||||
# ...
|
||||
virtual_network:
|
||||
# Virtual Network should be the same for all resources, with a differing subnet
|
||||
network_security:
|
||||
# Optional, but recommended network security rules
|
||||
vm_size: # appropriate VM size
|
||||
vm_count: # Number greater than 1 will create an HA Slurm cluster
|
||||
# 3. define login node settings
|
||||
login:
|
||||
ssh:
|
||||
# The cluster user SSH and username settings
|
||||
public_ip:
|
||||
# ...
|
||||
virtual_network:
|
||||
# Virtual Network should be the same for all resources, with a differing subnet
|
||||
network_security:
|
||||
# Optional, but recommended network security rules
|
||||
vm_size: # appropriate VM size
|
||||
vm_count: # Number greater than 1 will create multiple login nodes
|
||||
# 4. define shared file systems
|
||||
shared_data_volumes:
|
||||
nfs_server: # Batch Shipyard RemoteFS storage cluster id
|
||||
mount_path: # The mount path across all Slurm resources
|
||||
store_slurmctld_state: # at least one shared data volume must set this to true
|
||||
# 5. define Slurm options
|
||||
slurm_options:
|
||||
idle_reclaim_time: # amount of idle time before Slurm issues suspend on nodes
|
||||
elastic_partitions: # define Slurm elastic cloud bursting partitions
|
||||
partition_1:
|
||||
batch_pools:
|
||||
mypool1: # pool id, must be pre-allocated with zero nodes
|
||||
account_service_url: https://... # currently this must be the same as the Batch account specified in config.yaml
|
||||
compute_node_type: # dedicated or low_priority nodes
|
||||
max_compute_nodes: # maximum number of VMs to allocate
|
||||
weight: # Slurm weight
|
||||
features:
|
||||
# arbitrary constraint sequence
|
||||
reclaim_exclude_num_nodes: # number of nodes to exclude from idle reclaim.
|
||||
# Once allocated, this number of nodes is not reclaimed.
|
||||
# can define multiple pools
|
||||
max_runtime_limit: # maximum runtime for jobs in this partition
|
||||
default: # is the default partition, one partition must have this set to true
|
||||
unmanaged_partitions:
|
||||
# for pre-existing partitions (or on-prem partitions)
|
||||
```
|
||||
|
||||
#### Global Configuration
|
||||
[Global configuration](12-batch-shipyard-configuration-global.md) should
|
||||
contain the appropriate RemoteFS shared file system/data volumes that are
|
||||
to be used across all Slurm resources under
|
||||
`global_resources`:`volumes`:`shared_data_volumes`. More than one RemoteFS
|
||||
shared data volume can be specified.
|
||||
|
||||
Optionally, if your workload will be container driven, you can specify
|
||||
image pre-loads here as per normal convention under `global_resources`.
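As a minimal sketch, the relevant `global_resources` fragment might look like
the following, mirroring the sample recipe configuration included in this
change; the volume name must match the RemoteFS storage cluster id referenced
in the Slurm configuration.

```yaml
global_resources:
  volumes:
    shared_data_volumes:
      nfs_server:  # must match the RemoteFS storage cluster id
        volume_driver: storage_cluster
        container_path: $AZ_BATCH_NODE_SHARED_DIR/nfs  # not used by Slurm resources
```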
|
||||
|
||||
#### Pool Configuration
|
||||
[Pool configuration](13-batch-shipyard-configuration-pool.md) should
|
||||
be used to create all necessary pools used for Slurm elastic partitions
|
||||
beforehand. This file is not used by `slurm cluster create`; it is only
used by `slurm cluster orchestrate` when orchestrating a Slurm cluster with
a single pool. If not utilizing the orchestrate command, then you must
create the pools individually before issuing `slurm cluster create`.
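For example, a hypothetical sequence (assuming the configuration files are
in the current working directory) would be:

```shell
# allocate each Batch pool referenced by the elastic partitions
# (repeat with the appropriate pool configuration for every pool)
SHIPYARD_CONFIGDIR=. ./shipyard pool add

# then create the Slurm controller and login nodes
SHIPYARD_CONFIGDIR=. ./shipyard slurm cluster create
```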
|
||||
|
||||
Most pool properties apply with no modifications for Slurm clusters. By
|
||||
default, all Slurm nodes have Docker installed. Do not use `native` mode
|
||||
for Slurm compute nodes.
|
||||
|
||||
### Limitations
|
||||
This is a non-exhaustive list of potential limitations while using
|
||||
the Slurm on Batch feature in Batch Shipyard.
|
||||
|
||||
* All pools must reside under the Batch account linked to the global
|
||||
configuration. This limitation will be lifted at a later date.
|
||||
* Shared file system (shared data volume) support is currently limited
|
||||
to supported RemoteFS provisioned storage clusters: NFS and GlusterFS.
|
||||
* Network Security Groups (NSGs) should permit communication between
|
||||
Slurm resources for all required communication channels and ports.
|
||||
* LDAP for centralized user control is not implemented, but can be
customized via the `additional_prep_script` option in the `controller` and
`login` sections of the Slurm configuration file and via
`additional_node_prep` for compute nodes.
|
||||
* PAM-based auth restrictions for preventing users from logging into
compute nodes without a running job are not yet implemented.
|
||||
* An action aggregator in the Batch Shipyard Slurm helper that would
|
||||
improve resize operation performance is not yet implemented.
|
||||
* Suspending and resuming the Slurm controller and login nodes is not
|
||||
yet supported.
|
||||
|
||||
### Quotas
|
||||
Ensure that you have sufficient core and pool quota for your Batch account.
|
||||
Please note that *all* quotas (except for the number of Batch accounts
|
||||
per region per subscription) apply to each individual Batch account
|
||||
separately. User subscription based Batch accounts share the underlying
|
||||
subscription regional core quotas.
|
||||
|
||||
## Sample Usage
|
||||
Please see the sample [Slurm+NFS recipe](../recipes/Slurm+NFS) for a working
example.
|
|
@ -148,7 +148,7 @@ the user that will execute the task should be present within the Docker
|
|||
container. SSH clients will also need to be transparently directed to
|
||||
connect to the alternate port and ignore input prompts since these
|
||||
programs will be run in non-interactive mode. If you cannot override your MPI
|
||||
runtime remote shell options, you can use an SSH config file stored in the
|
||||
runtime remote shell options, you can use an SSH `config` file stored in the
|
||||
respective root or user's `.ssh` directory alongside the keys:
|
||||
|
||||
```
|
||||
|
|
|
@ -60,9 +60,9 @@ factors that Batch Shipyard has no control over.
|
|||
#### What is `native` under pool `platform_image` and `custom_image`?
|
||||
`native` designates to Batch Shipyard to attempt to create the pool such
|
||||
that the pool works under native Docker mode where the compute nodes
|
||||
understand how to launch and execute Docker containers. Please understand
|
||||
that only a subset of `platform_image` combinations are compatible with
|
||||
`native` mode. You can refer to the
|
||||
"natively" understand how to launch and execute Docker containers. Please
|
||||
understand that only a subset of `platform_image` combinations are compatible
|
||||
with `native` mode. You can refer to the
|
||||
[Batch Shipyard Platform Image support doc](25-batch-shipyard-platform-image-support.md)
|
||||
for more information. Compliant
|
||||
[custom images](63-batch-shipyard-custom-images.md) are compatible with
|
||||
|
|
|
@ -43,7 +43,7 @@ is found through `%PATH%` or is in the current working directory.
|
|||
* Compute pool resize down (i.e., removing nodes from a pool) is not supported
|
||||
when peer-to-peer transfer is enabled.
|
||||
* The maximum number of compute nodes with peer-to-peer enabled is currently
|
||||
40 for Linux pools for non-UserSubscription Batch accounts. This check is
|
||||
100 for Linux pools for non-UserSubscription Batch accounts. This check is
|
||||
no longer performed before a pool is created and will instead result in
|
||||
a ResizeError on the pool if not all compute nodes can be allocated.
|
||||
* Data movement between Batch tasks as defined by `input_data`:`azure_batch`
|
||||
|
|
|
@ -479,7 +479,7 @@ class ServiceProxy():
|
|||
return self.batch_clients[batch_account]
|
||||
except KeyError:
|
||||
client = azure.batch.BatchServiceClient(
|
||||
self.creds.batch_creds, base_url=service_url)
|
||||
self.creds.batch_creds, batch_url=service_url)
|
||||
self._modify_client_for_retry_and_user_agent(client)
|
||||
self.batch_clients[batch_account] = client
|
||||
logger.debug('batch client created for account: {}'.format(
|
||||
|
|
|
@ -281,7 +281,7 @@ def _get_batch_credentials(
|
|||
return _BATCH_CLIENTS[batch_account]
|
||||
except KeyError:
|
||||
creds = create_msi_credentials(cloud, resource_id=resource_id)
|
||||
client = azure.batch.BatchServiceClient(creds, base_url=service_url)
|
||||
client = azure.batch.BatchServiceClient(creds, batch_url=service_url)
|
||||
_modify_client_for_retry_and_user_agent(client)
|
||||
_BATCH_CLIENTS[batch_account] = client
|
||||
logger.debug('batch client created for account: {}'.format(
|
||||
|
|
|
@ -20,6 +20,7 @@ pages:
|
|||
- RemoteFS: 15-batch-shipyard-configuration-fs.md
|
||||
- Monitoring: 16-batch-shipyard-configuration-monitor.md
|
||||
- Federation: 17-batch-shipyard-configuration-federation.md
|
||||
- Slurm: 18-batch-shipyard-configuration-slurm.md
|
||||
- CLI Commands and Usage: 20-batch-shipyard-usage.md
|
||||
- Platform Image support: 25-batch-shipyard-platform-image-support.md
|
||||
- In-Depth Feature Guides:
|
||||
|
@ -31,6 +32,7 @@ pages:
|
|||
- Remote Filesystems: 65-batch-shipyard-remote-fs.md
|
||||
- Resource Monitoring: 66-batch-shipyard-resource-monitoring.md
|
||||
- Federations: 68-batch-shipyard-federation.md
|
||||
- Slurm on Batch: 69-batch-shipyard-slurm.md
|
||||
- Data Movement: 70-batch-shipyard-data-movement.md
|
||||
- Azure KeyVault for Credential Management: 74-batch-shipyard-azure-keyvault.md
|
||||
- Credential Encryption: 75-batch-shipyard-credential-encryption.md
|
||||
|
|
|
@ -66,7 +66,8 @@ shipyard jobs tasks list --jobid blast --taskid merge-task-00001 --poll-until-ta
|
|||
# optionally egress the results.txt file from the compute node to local machine
|
||||
shipyard data files task --filespec blast,merge-task-00001,wd/results.txt
|
||||
|
||||
# clean-up
|
||||
shipyard jobs del -y --wiat jobs-blast.yaml
|
||||
# clean up
|
||||
shipyard jobs del -y jobs-split.yaml
|
||||
shipyard jobs del -y jobs-blast.yaml
|
||||
shipyard pool del -y
|
||||
```
|
||||
|
|
|
@ -23,8 +23,9 @@ Use the following links to quickly navigate to recipe collections:
|
|||
4. [Genomics and Bioinformatics](#genomics)
|
||||
5. [Molecular Dynamics (MD)](#md)
|
||||
6. [RemoteFS](#remotefs)
|
||||
7. [Video Processing](#video)
|
||||
8. [Windows](#windows)
|
||||
7. [Slurm on Batch](#slurm)
|
||||
8. [Video Processing](#video)
|
||||
9. [Windows](#windows)
|
||||
|
||||
## <a name="benchmarks"></a>Benchmarks
|
||||
#### [HPCG-Infiniband-IntelMPI](./HPCG-Infiniband-IntelMPI)
|
||||
|
@ -193,6 +194,11 @@ GlusterFS storage cluster.
|
|||
This RemoteFS-NFS recipe contains information on how to provision a sample
|
||||
single VM NFS server.
|
||||
|
||||
## <a name="slurm"></a>Slurm on Batch
|
||||
#### [Slurm+NFS](./Slurm+NFS)
|
||||
This recipe contains information on how to orchestrate a
|
||||
[Slurm](https://slurm.schedmd.com/) cluster with an NFS shared file system.
|
||||
|
||||
## <a name="video"></a>Video Processing
|
||||
#### [FFmpeg-GPU](./FFmpeg-GPU)
|
||||
This recipe contains information on how to containerize
|
||||
|
|
|
@ -49,7 +49,7 @@ or unspecified
|
|||
|
||||
### Batch Shipyard Commands
|
||||
After you have created your RemoteFS GlusterFS storage cluster via
|
||||
`fs cluster add`, then you can issue `pool add` with the above config
|
||||
`fs cluster orchestrate`, then you can issue `pool add` with the above config
|
||||
which will create a Batch pool and automatically link your GlusterFS
|
||||
storage cluster against your Batch pool. You can then use data placed on
|
||||
the storage cluster in your containerized workloads.
|
||||
|
|
|
@ -2,7 +2,7 @@ remote_fs:
|
|||
resource_group: my-resource-group
|
||||
location: <Azure region, e.g., eastus>
|
||||
managed_disks:
|
||||
premium: true
|
||||
sku: premium_lrs
|
||||
disk_size_gb: 1023
|
||||
disk_names:
|
||||
- p30-disk0a
|
||||
|
|
|
@ -45,20 +45,15 @@ there are at least 2 VMs, thus disks should be mapped in their respective
|
|||
cardinal entry.
|
||||
|
||||
### Commands to create the GlusterFS storage cluster
|
||||
After modifying the configuration files as required, then you must create
|
||||
the managed disks as the first step. The following assumes the configuration
|
||||
files are in the current working directory. First all of the managed disks
|
||||
used by the storage cluster must be provisioned:
|
||||
After modifying the configuration files as required, you can orchestrate
|
||||
the entire GlusterFS shared file system with `fs cluster orchestrate`. The
|
||||
`orchestrate` command wraps up the disk allocation (`fs disks add`) and file
|
||||
server creation (`fs cluster add`) into one command. The commands can be
|
||||
invoked separately if desired. The following assumes the configuration files
|
||||
are in the current working directory.
|
||||
|
||||
```shell
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs disks add
|
||||
```
|
||||
|
||||
After the managed disks have been created, then you can create the cluster
|
||||
with:
|
||||
|
||||
```shell
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs cluster add mystoragecluster
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs cluster orchestrate mystoragecluster
|
||||
```
|
||||
|
||||
This assumes that the storage cluster id is `mystoragecluster`. After the
|
||||
|
|
|
@ -2,7 +2,7 @@ remote_fs:
|
|||
resource_group: my-resource-group
|
||||
location: <Azure region, e.g., eastus>
|
||||
managed_disks:
|
||||
premium: true
|
||||
sku: premium_lrs
|
||||
disk_size_gb: 1023
|
||||
disk_names:
|
||||
- p30-disk0a
|
||||
|
|
|
@ -40,20 +40,15 @@ here.
|
|||
is only a single VM, thus all disks should be mapped in the `"0"` entry.
|
||||
|
||||
### Commands to create the NFS file server
|
||||
After modifying the configuration files as required, then you must create
|
||||
the managed disks as the first step. The following assumes the configuration
|
||||
files are in the current working directory. First all of the managed disks
|
||||
used by the file server must be provisioned:
|
||||
After modifying the configuration files as required, you can orchestrate
|
||||
the entire NFS file server with `fs cluster orchestrate`. The `orchestrate`
|
||||
command wraps up the disk allocation (`fs disks add`) and file server
|
||||
creation (`fs cluster add`) into one command. The commands can be invoked
|
||||
separately if desired. The following assumes the configuration files are in
|
||||
the current working directory.
|
||||
|
||||
```shell
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs disks add
|
||||
```
|
||||
|
||||
After the managed disks have been created, then you can create the cluster
|
||||
with:
|
||||
|
||||
```shell
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs cluster add mystoragecluster
|
||||
SHIPYARD_CONFIGDIR=. ./shipyard fs cluster orchestrate mystoragecluster
|
||||
```
|
||||
|
||||
This assumes that the storage cluster id is `mystoragecluster`. After the
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
# Slurm+NFS
|
||||
This recipe shows how to orchestrate a Slurm on Batch cluster with a single
|
||||
Batch pool providing compute node VMs for Slurm workloads along with a shared
|
||||
NFS filesystem.
|
||||
|
||||
## Configuration
|
||||
Please refer to this [set of sample configuration files](./config) for
this recipe.
|
||||
|
||||
### Credentials Configuration
|
||||
The credentials configuration should have `management` Azure Active Directory
|
||||
credentials defined along with a valid storage account. The `management`
|
||||
section can be supplied through environment variables instead if preferred.
|
||||
The `batch` section, which associates all of the Batch pools used by Slurm
partitions, should also be populated. Additionally, a `slurm` section with
the `db_password` must be defined.
|
||||
|
||||
### Pool Configuration
|
||||
The pool configuration can be modified as necessary for the requisite OS
|
||||
and other tooling that should be installed. The `vm_count` should be kept
|
||||
as `0` for both `dedicated` and `low_priority` during the initial allocation
|
||||
as Slurm's elastic cloud bursting will size the pools appropriately.
|
||||
|
||||
### FS Configuration
|
||||
The remote fs configuration file requires modification. Properties to
|
||||
modify are:
|
||||
* `resource_group` all resource groups should be modified to fit your
|
||||
scenario.
|
||||
* `location` should be modified to the Azure region where you would like
|
||||
the storage cluster created. If linking against Azure Batch compute nodes,
|
||||
it should be in the same region as your Azure Batch account.
|
||||
* `managed_disks` should be modified for the number, size and type of
|
||||
managed disks to allocate for the file server.
|
||||
* `storage_clusters` should be modified to have a unique name instead of
|
||||
`mystoragecluster` if you prefer.
|
||||
* `hostname_prefix` should be modified to your preferred resource name
prefix.
|
||||
* `virtual_network` should be modified for the address prefixes and subnet
|
||||
properties that you prefer.
|
||||
* `network_security` should be modified for inbound network security rules
|
||||
to apply for SSH and external NFSv4 client mounts. If no NFSv4 clients
|
||||
external to the virtual network are needed, then the entire `nfs` security
|
||||
rule can be omitted.
|
||||
* `file_server` options such as `mountpoint` and `mount_options` should be
modified for your scenario. The `type` should not be changed from `nfs`.
|
||||
* `vm_size` can be modified for the file server depending upon your scenario.
|
||||
If using premium managed disks, then a premium VM size must be selected
|
||||
here.
|
||||
* `vm_disk_map` contains all of the disks used for each VM. For `nfs`, there
|
||||
is only a single VM, thus all disks should be mapped in the `"0"` entry.
|
||||
|
||||
### Slurm Configuration
|
||||
The Slurm configuration should include the appropriate location and virtual
|
||||
network settings for the controller and login nodes, in addition to defining
|
||||
the appropriate elastic partitions. Please see the
|
||||
[Slurm on Batch](../../docs/69-batch-shipyard-slurm.md) guide and the
|
||||
[Slurm configuration](../../docs/18-batch-shipyard-configuration-slurm.md) document for more
|
||||
information on each option.
|
||||
|
||||
### Commands to orchestrate the Slurm cluster
|
||||
After modifying the configuration files as required, you can orchestrate
|
||||
the entire Slurm cluster creation with `slurm cluster orchestrate`. The
|
||||
`orchestrate` command wraps up the NFS disk allocation (`fs disks add`), NFS
|
||||
file server creation (`fs cluster add`), Batch pool allocation (`pool add`),
|
||||
and Slurm controller/login creation (`slurm cluster create`) into one command.
|
||||
The commands can be invoked separately if desired. The following assumes the
|
||||
configuration files are in the current working directory.
|
||||
|
||||
```shell
|
||||
# ensure all configuration files are in the appropriate directory
|
||||
export SHIPYARD_CONFIGDIR=.
|
||||
|
||||
# orchestrate the Slurm cluster
|
||||
./shipyard slurm cluster orchestrate --storage-cluster-id nfs -y
|
||||
```
|
||||
|
||||
You can log into the login nodes by issuing the command:
|
||||
|
||||
```shell
|
||||
./shipyard slurm ssh login
|
||||
```
|
||||
|
||||
which will default to logging into the first login node (since this cluster
|
||||
only has one login node, it is the only possible node to log in to).
|
||||
|
||||
There you will be able to run your Slurm commands such as `sbatch`, `squeue`,
`salloc`, `srun`, etc.
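For example, a quick hypothetical smoke test from a login node might look
like:

```shell
# inspect the elastic partitions and node states
sinfo

# submit a trivial job; a compute node will be provisioned on demand
sbatch --wrap "hostname"

# watch the job get scheduled once the node is ready
squeue
```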
|
||||
|
||||
To delete the Slurm cluster:
|
||||
|
||||
```shell
|
||||
# delete the Batch pool providing Slurm compute nodes
|
||||
./shipyard pool del -y
|
||||
|
||||
# delete the Slurm controller and login nodes
|
||||
./shipyard slurm cluster destroy -y
|
||||
|
||||
# delete the RemoteFS shared file system
|
||||
./shipyard fs cluster del nfs -y --delete-data-disks
|
||||
```
|
|
@ -0,0 +1,8 @@
|
|||
batch_shipyard:
|
||||
storage_account_settings: mystorageaccount
|
||||
global_resources:
|
||||
volumes:
|
||||
shared_data_volumes:
|
||||
nfs:
|
||||
container_path: $AZ_BATCH_NODE_SHARED_DIR/nfs # this is not used
|
||||
volume_driver: storage_cluster
|
|
@ -0,0 +1,37 @@
|
|||
credentials:
|
||||
batch:
|
||||
aad:
|
||||
endpoint: https://batch.core.windows.net/
|
||||
directory_id: <AAD directory id>
|
||||
application_id: <AAD application id, if using SP login>
|
||||
auth_key: <Auth key for SP login>
|
||||
rsa_private_key_pem: <RSA private key for SP login>
|
||||
x509_cert_sha1_thumbprint: <X509 cert thumbprint for SP login>
|
||||
user: <AAD username for directory if using user login>
|
||||
password: <AAD password for username above if using user login without multi-factor auth>
|
||||
token_cache:
|
||||
enabled: true
|
||||
filename: .aad_token_cache
|
||||
account_service_url: https://<myaccount>.<region>.batch.azure.com
|
||||
resource_group: my-resource-group
|
||||
management:
|
||||
aad:
|
||||
endpoint: https://management.azure.com/
|
||||
directory_id: <AAD directory id>
|
||||
application_id: <AAD application id, if using SP login>
|
||||
auth_key: <Auth key for SP login>
|
||||
rsa_private_key_pem: <RSA private key for SP login>
|
||||
x509_cert_sha1_thumbprint: <X509 cert thumbprint for SP login>
|
||||
user: <AAD username for directory if using user login>
|
||||
password: <AAD password for username above if using user login without multi-factor auth>
|
||||
token_cache:
|
||||
enabled: true
|
||||
filename: .aad_token_cache
|
||||
subscription_id: <subscription id>
|
||||
storage:
|
||||
mystorageaccount:
|
||||
account: <storage account name>
|
||||
account_key: <storage account key>
|
||||
endpoint: core.windows.net
|
||||
slurm:
|
||||
db_password: <slurm db pass>
|
|
@ -0,0 +1,49 @@
|
|||
remote_fs:
|
||||
resource_group: my-resource-group
|
||||
location: <Azure region, e.g., eastus>
|
||||
managed_disks:
|
||||
sku: premium_lrs
|
||||
disk_size_gb: 1023
|
||||
disk_names:
|
||||
- p30-disk0a
|
||||
- p30-disk1a
|
||||
storage_clusters:
|
||||
nfs:
|
||||
hostname_prefix: mystoragecluster
|
||||
ssh:
|
||||
username: shipyard
|
||||
file_server:
|
||||
server_options:
|
||||
'*':
|
||||
- rw
|
||||
- no_root_squash
|
||||
- no_subtree_check
|
||||
mount_options:
|
||||
- noatime
|
||||
- nodiratime
|
||||
mountpoint: /shared
|
||||
type: nfs
|
||||
network_security:
|
||||
nfs:
|
||||
- <ip address prefix in cidr notation for allowable external clients>
|
||||
ssh:
|
||||
- '*'
|
||||
virtual_network:
|
||||
address_space: 10.0.0.0/8
|
||||
existing_ok: true
|
||||
name: myvnet
|
||||
subnet:
|
||||
address_prefix: 10.0.1.0/24
|
||||
name: my-nfs-server-subnet
|
||||
public_ip:
|
||||
enabled: true
|
||||
static: false
|
||||
vm_count: 1
|
||||
vm_size: STANDARD_F16S
|
||||
vm_disk_map:
|
||||
'0':
|
||||
disk_array:
|
||||
- p30-disk0a
|
||||
- p30-disk1a
|
||||
filesystem: btrfs
|
||||
raid_level: 0
|
|
@ -0,0 +1,18 @@
|
|||
pool_specification:
|
||||
id: slurmpool
|
||||
virtual_network:
|
||||
name: myvnet
|
||||
resource_group: my-resource-group
|
||||
address_space: 10.0.0.0/8
|
||||
subnet:
|
||||
name: batch-nodes
|
||||
address_prefix: 10.1.0.0/16
|
||||
vm_configuration:
|
||||
platform_image:
|
||||
offer: UbuntuServer
|
||||
publisher: Canonical
|
||||
sku: 18.04-LTS
|
||||
vm_count:
|
||||
dedicated: 0
|
||||
low_priority: 0
|
||||
vm_size: STANDARD_F2
|
|
@ -0,0 +1,55 @@
|
|||
slurm:
|
||||
storage_account_settings: mystorageaccount
|
||||
resource_group: my-resource-group
|
||||
location: <Azure region, e.g., eastus>
|
||||
cluster_id: myslurmcluster
|
||||
controller:
|
||||
ssh:
|
||||
username: shipyardadmin
|
||||
public_ip:
|
||||
enabled: true
|
||||
virtual_network:
|
||||
address_space: 10.0.0.0/8
|
||||
existing_ok: true
|
||||
name: myvnet
|
||||
subnet:
|
||||
address_prefix: 10.0.2.0/24
|
||||
name: slurm-controller-subnet
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
vm_size: STANDARD_D2_V2
|
||||
vm_count: 1
|
||||
login:
|
||||
ssh:
|
||||
username: shipyard
|
||||
public_ip:
|
||||
enabled: true
|
||||
virtual_network:
|
||||
address_space: 10.0.0.0/8
|
||||
existing_ok: true
|
||||
name: myvnet
|
||||
subnet:
|
||||
address_prefix: 10.0.3.0/24
|
||||
name: slurm-login-subnet
|
||||
network_security:
|
||||
ssh:
|
||||
- '*'
|
||||
vm_size: STANDARD_D2_V2
|
||||
vm_count: 1
|
||||
shared_data_volumes:
|
||||
nfs:
|
||||
host_mount_path: /shared
|
||||
store_slurmctld_state: true
|
||||
slurm_options:
|
||||
idle_reclaim_time: 00:30:00
|
||||
elastic_partitions:
|
||||
mypart1:
|
||||
batch_pools:
|
||||
slurmpool:
|
||||
account_service_url: https://<myaccount>.<region>.batch.azure.com
|
||||
compute_node_type: dedicated
|
||||
max_compute_nodes: 32
|
||||
weight: 0
|
||||
reclaim_exclude_num_nodes: 4
|
||||
default: true
|
|
@ -231,3 +231,9 @@ mapping:
|
|||
type: str
|
||||
password_keyvault_secret_id:
|
||||
type: str
|
||||
slurm:
|
||||
type: map
|
||||
mapping:
|
||||
db_password:
|
||||
type: text
|
||||
required: true
|
||||
|
|
|
@ -0,0 +1,268 @@
|
|||
desc: Slurm Configuration Schema
|
||||
|
||||
type: map
|
||||
mapping:
|
||||
slurm:
|
||||
type: map
|
||||
mapping:
|
||||
storage_account_settings:
|
||||
type: str
|
||||
required: true
|
||||
location:
|
||||
type: str
|
||||
required: true
|
||||
resource_group:
|
||||
type: str
|
||||
required: true
|
||||
cluster_id:
|
||||
type: str
|
||||
required: true
|
||||
controller:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
ssh:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
username:
|
||||
type: str
|
||||
required: true
|
||||
ssh_public_key:
|
||||
type: str
|
||||
ssh_public_key_data:
|
||||
type: str
|
||||
ssh_private_key:
|
||||
type: str
|
||||
generated_file_export_path:
|
||||
type: str
|
||||
public_ip:
|
||||
type: map
|
||||
mapping:
|
||||
enabled:
|
||||
type: bool
|
||||
static:
|
||||
type: bool
|
||||
virtual_network:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
name:
|
||||
type: str
|
||||
required: true
|
||||
resource_group:
|
||||
type: str
|
||||
existing_ok:
|
||||
type: bool
|
||||
address_space:
|
||||
type: str
|
||||
subnet:
|
||||
type: map
|
||||
mapping:
|
||||
name:
|
||||
type: str
|
||||
required: true
|
||||
address_prefix:
|
||||
type: str
|
||||
required: true
|
||||
network_security:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
ssh:
|
||||
type: seq
|
||||
required: true
|
||||
sequence:
|
||||
- type: str
|
||||
custom_inbound_rules:
|
||||
type: map
|
||||
mapping:
|
||||
regex;([a-zA-Z0-9]+):
|
||||
type: map
|
||||
mapping:
|
||||
destination_port_range:
|
||||
type: str
|
||||
required: true
|
||||
protocol:
|
||||
type: str
|
||||
enum: ['*', 'tcp', 'udp']
|
||||
source_address_prefix:
|
||||
type: seq
|
||||
required: true
|
||||
sequence:
|
||||
- type: str
|
||||
vm_size:
|
||||
type: str
|
||||
required: true
|
||||
vm_count:
|
||||
type: int
|
||||
required: true
|
||||
range:
|
||||
min: 1
|
||||
max: 3
|
||||
accelerated_networking:
|
||||
type: bool
|
||||
additional_prep_script:
|
||||
type: str
|
||||
login:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
ssh:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
username:
|
||||
type: str
|
||||
required: true
|
||||
ssh_public_key:
|
||||
type: str
|
||||
ssh_public_key_data:
|
||||
type: str
|
||||
ssh_private_key:
|
||||
type: str
|
||||
generated_file_export_path:
|
||||
type: str
|
||||
public_ip:
|
||||
type: map
|
||||
mapping:
|
||||
enabled:
|
||||
type: bool
|
||||
static:
|
||||
type: bool
|
||||
virtual_network:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
name:
|
||||
type: str
|
||||
required: true
|
||||
resource_group:
|
||||
type: str
|
||||
existing_ok:
|
||||
type: bool
|
||||
address_space:
|
||||
type: str
|
||||
subnet:
|
||||
type: map
|
||||
mapping:
|
||||
name:
|
||||
type: str
|
||||
required: true
|
||||
address_prefix:
|
||||
type: str
|
||||
required: true
|
||||
network_security:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
ssh:
|
||||
type: seq
|
||||
required: true
|
||||
sequence:
|
||||
- type: str
|
||||
custom_inbound_rules:
|
||||
type: map
|
||||
mapping:
|
||||
regex;([a-zA-Z0-9]+):
|
||||
type: map
|
||||
mapping:
|
||||
destination_port_range:
|
||||
type: str
|
||||
required: true
|
||||
protocol:
|
||||
type: str
|
||||
enum: ['*', 'tcp', 'udp']
|
||||
source_address_prefix:
|
||||
type: seq
|
||||
required: true
|
||||
sequence:
|
||||
- type: str
|
||||
vm_size:
|
||||
type: str
|
||||
required: true
|
||||
vm_count:
|
||||
type: int
|
||||
required: true
|
||||
range:
|
||||
min: 1
|
||||
accelerated_networking:
|
||||
type: bool
|
||||
additional_prep_script:
|
||||
type: str
|
||||
shared_data_volumes:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
regex;([a-zA-Z0-9]+):
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
host_mount_path:
|
||||
type: str
|
||||
required: true
|
||||
store_slurmctld_state:
|
||||
type: bool
|
||||
required: true
|
||||
slurm_options:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
idle_reclaim_time:
|
||||
type: str
|
||||
elastic_partitions:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
regex;([a-zA-Z0-9]+):
|
||||
type: map
|
||||
mapping:
|
||||
batch_pools:
|
||||
type: map
|
||||
required: true
|
||||
mapping:
|
||||
regex;([a-zA-Z0-9]+):
|
||||
type: map
|
||||
mapping:
|
||||
account_service_url:
|
||||
type: str
|
||||
compute_node_type:
|
||||
type: str
|
||||
required: true
|
||||
enum: ['dedicated', 'low_priority']
|
||||
max_compute_nodes:
|
||||
type: int
|
||||
required: true
|
||||
range:
|
||||
min: 1
|
||||
weight:
|
||||
type: int
|
||||
required: true
|
||||
range:
|
||||
min: 0
|
||||
features:
|
||||
type: seq
|
||||
sequence:
|
||||
- type: text
|
||||
reclaim_exclude_num_nodes:
|
||||
type: int
|
||||
range:
|
||||
min: 0
|
||||
max_runtime_limit:
|
||||
type: str
|
||||
default:
|
||||
type: bool
|
||||
required: true
|
||||
unmanaged_partitions:
|
||||
type: seq
|
||||
sequence:
|
||||
- type: map
|
||||
mapping:
|
||||
partition:
|
||||
type: str
|
||||
required: true
|
||||
nodes:
|
||||
type: seq
|
||||
required: true
|
||||
sequence:
|
||||
- type: str
|
|
@ -6,10 +6,10 @@ set -e
|
|||
set -o pipefail
|
||||
|
||||
# version consts
|
||||
DOCKER_CE_VERSION_DEBIAN=18.03.1
|
||||
DOCKER_CE_VERSION_DEBIAN=18.09.1
|
||||
|
||||
# consts
|
||||
DOCKER_CE_PACKAGE_DEBIAN="docker-ce=${DOCKER_CE_VERSION_DEBIAN}~ce~3-0~"
|
||||
DOCKER_CE_PACKAGE_DEBIAN="docker-ce=5:${DOCKER_CE_VERSION_DEBIAN}~3-0~"
|
||||
SHIPYARD_VAR_DIR=/var/batch-shipyard
|
||||
SHIPYARD_CONF_FILE=${SHIPYARD_VAR_DIR}/federation.json
|
||||
|
||||
|
|
|
@ -6,10 +6,10 @@ set -e
|
|||
set -o pipefail
|
||||
|
||||
# version consts
|
||||
DOCKER_CE_VERSION_DEBIAN=18.03.1
|
||||
DOCKER_CE_VERSION_DEBIAN=18.09.1
|
||||
|
||||
# consts
|
||||
DOCKER_CE_PACKAGE_DEBIAN="docker-ce=${DOCKER_CE_VERSION_DEBIAN}~ce~3-0~"
|
||||
DOCKER_CE_PACKAGE_DEBIAN="docker-ce=5:${DOCKER_CE_VERSION_DEBIAN}~3-0~"
|
||||
SHIPYARD_VAR_DIR=/var/batch-shipyard
|
||||
SHIPYARD_CONF_FILE=${SHIPYARD_VAR_DIR}/heimdall.json
|
||||
PROMETHEUS_VAR_DIR=${SHIPYARD_VAR_DIR}/prometheus
|
||||
|
|
|
@ -943,8 +943,9 @@ install_kata_containers() {
|
|||
|
||||
process_fstab_entry() {
|
||||
local desc=$1
|
||||
local mountpoint=$2
|
||||
local fstab_entry=$3
|
||||
local fstab_entry=$2
|
||||
IFS=' ' read -ra fs <<< "$fstab_entry"
|
||||
local mountpoint="${fs[1]}"
|
||||
log INFO "Creating host directory for $desc at $mountpoint"
|
||||
mkdir -p "$mountpoint"
|
||||
chmod 777 "$mountpoint"
|
||||
|
@ -976,15 +977,14 @@ process_fstab_entry() {
|
|||
}
|
||||
|
||||
mount_storage_clusters() {
|
||||
if [ -n "$sc_args" ]; then
|
||||
if [ -n "$SHIPYARD_STORAGE_CLUSTER_FSTAB" ]; then
|
||||
log DEBUG "Mounting storage clusters"
|
||||
# eval and split fstab var to expand vars (this is ok since it is set by shipyard)
|
||||
fstab_mounts=$(eval echo "$SHIPYARD_STORAGE_CLUSTER_FSTAB")
|
||||
IFS='#' read -ra fstabs <<< "$fstab_mounts"
|
||||
i=0
|
||||
for sc_arg in "${sc_args[@]}"; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
mount "${MOUNTS_PATH}"/"${sc[1]}"
|
||||
IFS='#' read -ra fstab_mounts <<< "$SHIPYARD_STORAGE_CLUSTER_FSTAB"
|
||||
for fstab in "${fstab_mounts[@]}"; do
|
||||
# eval and split fstab var to expand vars
|
||||
fstab_entry=$(eval echo "$fstab")
|
||||
IFS=' ' read -ra parts <<< "$fstab_entry"
|
||||
mount "${parts[1]}"
|
||||
done
|
||||
log INFO "Storage clusters mounted"
|
||||
fi
|
||||
|
@ -1000,10 +1000,10 @@ process_storage_clusters() {
|
|||
for sc_arg in "${sc_args[@]}"; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
fstab_entry="${fstabs[$i]}"
|
||||
process_fstab_entry "$sc_arg" "$MOUNTS_PATH/${sc[1]}" "$fstab_entry"
|
||||
process_fstab_entry "$sc_arg" "$fstab_entry"
|
||||
i=$((i + 1))
|
||||
done
|
||||
log INFO "Storage clusters mounted"
|
||||
log INFO "Storage clusters processed"
|
||||
fi
|
||||
}
|
||||
|
||||
|
@ -1029,9 +1029,9 @@ process_custom_fstab() {
|
|||
# eval and split fstab var to expand vars
|
||||
fstab_entry=$(eval echo "$fstab")
|
||||
IFS=' ' read -ra parts <<< "$fstab_entry"
|
||||
process_fstab_entry "${parts[2]}" "${parts[1]}" "$fstab_entry"
|
||||
process_fstab_entry "${parts[2]}" "$fstab_entry"
|
||||
done
|
||||
log INFO "Custom mounts via fstab mounted"
|
||||
log INFO "Custom mounts via fstab processed"
|
||||
fi
|
||||
}
|
||||
|
||||
|
@ -1637,26 +1637,9 @@ elif [ -f "$nodeprepfinished" ]; then
|
|||
install_and_start_node_exporter
|
||||
install_and_start_cadvisor
|
||||
# mount any storage clusters
|
||||
if [ -n "$sc_args" ]; then
|
||||
# eval and split fstab var to expand vars (this is ok since it is set by shipyard)
|
||||
fstab_mounts=$(eval echo "$SHIPYARD_STORAGE_CLUSTER_FSTAB")
|
||||
IFS='#' read -ra fstabs <<< "$fstab_mounts"
|
||||
i=0
|
||||
for sc_arg in "${sc_args[@]}"; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
mount "${MOUNTS_PATH}"/"${sc[1]}"
|
||||
done
|
||||
fi
|
||||
mount_storage_clusters
|
||||
# mount any custom mounts
|
||||
if [ -n "$SHIPYARD_CUSTOM_MOUNTS_FSTAB" ]; then
|
||||
IFS='#' read -ra fstab_mounts <<< "$SHIPYARD_CUSTOM_MOUNTS_FSTAB"
|
||||
for fstab in "${fstab_mounts[@]}"; do
|
||||
# eval and split fstab var to expand vars
|
||||
fstab_entry=$(eval echo "$fstab")
|
||||
IFS=' ' read -ra parts <<< "$fstab_entry"
|
||||
mount "${parts[1]}"
|
||||
done
|
||||
fi
|
||||
mount_custom_fstab
|
||||
# mount glusterfs on compute volumes
|
||||
if [ $gluster_on_compute -eq 1 ]; then
|
||||
if [ $custom_image -eq 1 ]; then
|
||||
|
|
|
@ -0,0 +1,576 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
|
||||
set -e
|
||||
set -o pipefail
|
||||
|
||||
# version consts
|
||||
SLURM_VERSION=18.08.5-2
|
||||
|
||||
# consts
|
||||
SLURM_PACKAGE_DEBIAN="slurm-${SLURM_VERSION}_1.0_amd64"
|
||||
SLURM_PACKAGE_CENTOS="slurm-${SLURM_VERSION}-1.0-1.x86_64"
|
||||
SLURM_CONF_DIR=/etc/slurm
|
||||
AZFILE_MOUNT_DIR=/azfile-slurm
|
||||
SHIPYARD_VAR_DIR=/var/batch-shipyard
|
||||
SHIPYARD_CONF_FILE=${SHIPYARD_VAR_DIR}/slurm.json
|
||||
SHIPYARD_HOST_FILE=${SHIPYARD_VAR_DIR}/slurm_host
|
||||
SHIPYARD_COMPLETED_ASSIGNMENT_FILE=${SHIPYARD_VAR_DIR}/slurm_host.assigned
|
||||
SHIPYARD_PROVISION_FAILED_FILE=${SHIPYARD_VAR_DIR}/slurm_host.failed
|
||||
HOSTNAME=$(hostname -s)
|
||||
HOSTNAME=${HOSTNAME,,}
|
||||
IP_ADDRESS=$(ip addr list eth0 | grep "inet " | cut -d' ' -f6 | cut -d/ -f1)
|
||||
|
||||
log() {
|
||||
local level=$1
|
||||
shift
|
||||
echo "$(date -u -Ins) - $level - $*"
|
||||
}
|
||||
|
||||
# dump uname immediately
|
||||
uname -ar
|
||||
|
||||
# try to get distrib vars
|
||||
if [ -e /etc/os-release ]; then
|
||||
. /etc/os-release
|
||||
DISTRIB_ID=$ID
|
||||
DISTRIB_RELEASE=$VERSION_ID
|
||||
DISTRIB_CODENAME=$VERSION_CODENAME
|
||||
if [ -z "$DISTRIB_CODENAME" ]; then
|
||||
if [ "$DISTRIB_ID" == "debian" ] && [ "$DISTRIB_RELEASE" == "9" ]; then
|
||||
DISTRIB_CODENAME=stretch
|
||||
fi
|
||||
fi
|
||||
else
|
||||
if [ -e /etc/lsb-release ]; then
|
||||
. /etc/lsb-release
|
||||
fi
|
||||
fi
|
||||
if [ -z "${DISTRIB_ID+x}" ] || [ -z "${DISTRIB_RELEASE+x}" ]; then
|
||||
log ERROR "Unknown DISTRIB_ID or DISTRIB_RELEASE."
|
||||
exit 1
|
||||
fi
|
||||
if [ -z "${DISTRIB_CODENAME}" ]; then
|
||||
log WARNING "Unknown DISTRIB_CODENAME."
|
||||
fi
|
||||
DISTRIB_ID=${DISTRIB_ID,,}
|
||||
DISTRIB_RELEASE=${DISTRIB_RELEASE,,}
|
||||
DISTRIB_CODENAME=${DISTRIB_CODENAME,,}
|
||||
|
||||
# set distribution specific vars
|
||||
PACKAGER=
|
||||
PACKAGE_SUFFIX=
|
||||
SLURM_PACKAGE=
|
||||
if [ "$DISTRIB_ID" == "ubuntu" ]; then
|
||||
PACKAGER=apt
|
||||
PACKAGE_SUFFIX=deb
|
||||
SLURM_PACKAGE="${SLURM_PACKAGE_DEBIAN}.${PACKAGE_SUFFIX}"
|
||||
elif [ "$DISTRIB_ID" == "debian" ]; then
|
||||
PACKAGER=apt
|
||||
PACKAGE_SUFFIX=deb
|
||||
SLURM_PACKAGE="${SLURM_PACKAGE_DEBIAN}.${PACKAGE_SUFFIX}"
|
||||
elif [[ $DISTRIB_ID == centos* ]] || [ "$DISTRIB_ID" == "rhel" ]; then
|
||||
PACKAGER=yum
|
||||
PACKAGE_SUFFIX=rpm
|
||||
SLURM_PACKAGE="${SLURM_PACKAGE_CENTOS}.${PACKAGE_SUFFIX}"
|
||||
else
|
||||
PACKAGER=zypper
|
||||
PACKAGE_SUFFIX=rpm
|
||||
SLURM_PACKAGE="${SLURM_PACKAGE_CENTOS}.${PACKAGE_SUFFIX}"
|
||||
fi
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
fi
|
||||
|
||||
# globals
|
||||
aad_cloud=
|
||||
cluster_id=
|
||||
cluster_user=
|
||||
queue_assign=
|
||||
storage_account=
|
||||
storage_key=
|
||||
storage_ep=
|
||||
storage_prefix=
|
||||
shipyardversion=
|
||||
|
||||
# process command line options
|
||||
while getopts "h?a:i:q:s:u:v:" opt; do
|
||||
case "$opt" in
|
||||
h|\?)
|
||||
echo "shipyard_slurm_computenode_bootstrap.sh parameters"
|
||||
echo ""
|
||||
echo "-a [aad cloud type] AAD cloud type for MSI"
|
||||
echo "-i [id] cluster id"
|
||||
echo "-q [assign] queue names"
|
||||
echo "-s [storage account:storage key:storage ep:prefix] storage config"
|
||||
echo "-u [user] cluster username"
|
||||
echo "-v [version] batch-shipyard version"
|
||||
echo ""
|
||||
exit 1
|
||||
;;
|
||||
a)
|
||||
aad_cloud=${OPTARG,,}
|
||||
;;
|
||||
i)
|
||||
cluster_id=${OPTARG}
|
||||
;;
|
||||
q)
|
||||
queue_assign=${OPTARG}
|
||||
;;
|
||||
s)
|
||||
IFS=':' read -ra ss <<< "${OPTARG}"
|
||||
storage_account=${ss[0]}
|
||||
storage_key=${ss[1]}
|
||||
storage_ep=${ss[2]}
|
||||
storage_prefix=${ss[3]}
|
||||
;;
|
||||
u)
|
||||
cluster_user=${OPTARG}
|
||||
;;
|
||||
v)
|
||||
shipyardversion=$OPTARG
|
||||
;;
|
||||
esac
|
||||
done
|
||||
shift $((OPTIND-1))
|
||||
[ "$1" = "--" ] && shift
|
||||
# check required params
|
||||
if [ -z "$aad_cloud" ]; then
|
||||
log ERROR "AAD cloud type not specified"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
check_for_buggy_ntfs_mount() {
|
||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||
set +e
|
||||
mount | grep /dev/sdb1 | grep fuseblk
|
||||
local rc=$?
|
||||
set -e
|
||||
if [ $rc -eq 0 ]; then
|
||||
log ERROR "/dev/sdb1 temp disk is mounted as fuseblk/ntfs"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
download_file_as() {
|
||||
log INFO "Downloading: $1 as $2"
|
||||
local retries=10
|
||||
set +e
|
||||
while [ $retries -gt 0 ]; do
|
||||
if curl -fSsL "$1" -o "$2"; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not download: $1"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
add_repo() {
|
||||
local url=$1
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
curl -fSsL "$url" | apt-key add -
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum-config-manager --add-repo "$url"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper addrepo "$url"
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not add repo: $url"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
refresh_package_index() {
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
apt-get update
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum makecache -y fast
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper -n --gpg-auto-import-keys ref
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not update package index"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
install_packages() {
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
apt-get install -y -q -o Dpkg::Options::="--force-confnew" --no-install-recommends "$@"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum install -y "$@"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper -n in "$@"
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not install packages ($PACKAGER): $*"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
install_local_packages() {
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
dpkg -i "$@"
|
||||
rc=$?
|
||||
else
|
||||
rpm -Uvh --nodeps "$@"
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not install local packages: $*"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
start_and_check_slurmd() {
|
||||
local retries=120
|
||||
local rc
|
||||
set +e
|
||||
systemctl start slurmd
|
||||
while [ $retries -gt 0 ]; do
|
||||
if systemctl --no-pager status slurmd; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "slurmd could not start properly"
|
||||
exit 1
|
||||
fi
|
||||
systemctl restart slurmd
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
create_batch_shipyard_slurm_config() {
|
||||
mkdir -p ${SHIPYARD_VAR_DIR}
|
||||
chmod 755 ${SHIPYARD_VAR_DIR}
|
||||
cat > ${SHIPYARD_CONF_FILE} << EOF
|
||||
{
|
||||
"aad_cloud": "$aad_cloud",
|
||||
"storage": {
|
||||
"account": "$storage_account",
|
||||
"account_key": "$storage_key",
|
||||
"endpoint": "$storage_ep",
|
||||
"entity_prefix": "$storage_prefix",
|
||||
"queues": {
|
||||
"assign": "$queue_assign"
|
||||
},
|
||||
"azfile_mount_dir": "$AZFILE_MOUNT_DIR"
|
||||
},
|
||||
"cluster_id": "$cluster_id",
|
||||
"cluster_user": "$cluster_user",
|
||||
"ip_address": "$IP_ADDRESS",
|
||||
"logging_id": "$AZ_BATCH_NODE_ID",
|
||||
"batch": {
|
||||
"account": "$AZ_BATCH_ACCOUNT_NAME",
|
||||
"pool_id": "$AZ_BATCH_POOL_ID",
|
||||
"node_id": "$AZ_BATCH_NODE_ID",
|
||||
"is_dedicated": "$AZ_BATCH_NODE_IS_DEDICATED"
|
||||
},
|
||||
"batch_shipyard": {
|
||||
"var_path": "$SHIPYARD_VAR_DIR",
|
||||
"version": "$shipyardversion"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
chmod 600 "$SHIPYARD_CONF_FILE"
|
||||
log INFO "Batch Shipyard slurm config created"
|
||||
}
|
||||
|
||||
check_provisioning_status() {
|
||||
local host=$1
|
||||
local reset_host=$2
|
||||
set +e
|
||||
docker run --rm -v "${SHIPYARD_VAR_DIR}:${SHIPYARD_VAR_DIR}:ro" \
|
||||
"alfpark/batch-shipyard:${shipyardversion}-slurm" \
|
||||
check-provisioning-status --conf "${SHIPYARD_CONF_FILE}" \
|
||||
--host "$1"
|
||||
rc=$?
|
||||
set -e
|
||||
if [ $rc -ne 0 ]; then
|
||||
log ERROR "Provisioning interrupt detected for host $1"
|
||||
if [ "$reset_host" -eq 1 ] && [ ! -s "$SHIPYARD_PROVISION_FAILED_FILE" ]; then
|
||||
host="${host}-$RANDOM"
|
||||
log DEBUG "Resetting host name to avoid collision: $host"
|
||||
hostnamectl set-hostname "${host}"
|
||||
hostnamectl status
|
||||
log DEBUG "Rebooting for hostname propagation to DNS"
|
||||
touch "$SHIPYARD_PROVISION_FAILED_FILE"
|
||||
shutdown -r now
|
||||
fi
|
||||
exit $rc
|
||||
fi
|
||||
}
|
||||
|
||||
log INFO "Bootstrap start"
|
||||
echo "Configuration:"
|
||||
echo "--------------"
|
||||
echo "OS Distribution: $DISTRIB_ID $DISTRIB_RELEASE $DISTRIB_CODENAME"
|
||||
echo "Hostname: $HOSTNAME"
|
||||
echo "IP Address: $IP_ADDRESS"
|
||||
echo "Batch Shipyard Version: $shipyardversion"
|
||||
echo "AAD cloud: $aad_cloud"
|
||||
echo "Storage: $storage_account:$storage_prefix"
|
||||
echo "Cluster Id: $cluster_id"
|
||||
echo "Cluster user: $cluster_user"
|
||||
echo "Assign queue: $queue_assign"
|
||||
echo ""
|
||||
|
||||
# check sdb1 mount
|
||||
check_for_buggy_ntfs_mount
|
||||
|
||||
# set sudoers to not require tty
|
||||
sed -i 's/^Defaults[ ]*requiretty/# Defaults requiretty/g' /etc/sudoers
|
||||
|
||||
# if provisioning failed previously, don't proceed further
|
||||
if [ -s "$SHIPYARD_PROVISION_FAILED_FILE" ]; then
|
||||
log ERROR "Slurm host provisioning failed."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# post-reboot token push steps
|
||||
if [ -s "$SHIPYARD_HOST_FILE" ]; then
|
||||
log INFO "Host assignment file found. Assuming reboot was successful."
|
||||
hostnamectl status
|
||||
|
||||
# slurmd is manually started since storage clusters are manually mounted
|
||||
# check slurmd in a loop; it can sometimes fail to start if the GPU is not yet ready
|
||||
start_and_check_slurmd
|
||||
|
||||
# update host entity with batch node id and ip address
|
||||
if [ ! -s "$SHIPYARD_COMPLETED_ASSIGNMENT_FILE" ]; then
|
||||
host=$(<${SHIPYARD_HOST_FILE})
|
||||
log DEBUG "Host from hostfile is: $host"
|
||||
check_provisioning_status "$host" 1
|
||||
|
||||
docker run --rm -v "${SHIPYARD_CONF_FILE}:${SHIPYARD_CONF_FILE}:ro" \
|
||||
-v "${AZFILE_MOUNT_DIR}:${AZFILE_MOUNT_DIR}:rw" \
|
||||
"alfpark/batch-shipyard:${shipyardversion}-slurm" \
|
||||
complete-node-assignment --conf "${SHIPYARD_CONF_FILE}" \
|
||||
--host "$host"
|
||||
touch "$SHIPYARD_COMPLETED_ASSIGNMENT_FILE"
|
||||
fi
|
||||
log INFO "Bootstrap completed"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# write batch shipyard config
|
||||
create_batch_shipyard_slurm_config
|
||||
|
||||
echo "Fetching host assignment"
|
||||
docker run --rm -v "${SHIPYARD_VAR_DIR}:${SHIPYARD_VAR_DIR}:rw" \
|
||||
-v "${AZFILE_MOUNT_DIR}:${AZFILE_MOUNT_DIR}:rw" \
|
||||
"alfpark/batch-shipyard:${shipyardversion}-slurm" \
|
||||
get-node-assignment --conf "${SHIPYARD_CONF_FILE}"
|
||||
host=$(<${SHIPYARD_HOST_FILE})
|
||||
echo "Hostname assignment retrieved: $host"
|
||||
|
||||
check_provisioning_status "$host" 0
|
||||
|
||||
# set cluster user and passwordless SSH for MPI jobs
|
||||
echo "Setting up cluster user: ${cluster_user}"
|
||||
useradd -o -u 2000 -N -g 1000 -p '!' -s /bin/bash -m -d "/home/${cluster_user}" "${cluster_user}"
|
||||
ssh_dir="/home/${cluster_user}/.ssh"
|
||||
mkdir -p "$ssh_dir"
|
||||
chmod 700 "$ssh_dir"
|
||||
echo "$SHIPYARD_SLURM_CLUSTER_USER_SSH_PUBLIC_KEY" > "${ssh_dir}/id_rsa.pub"
|
||||
chmod 644 "${ssh_dir}/id_rsa.pub"
|
||||
echo "$SHIPYARD_SLURM_CLUSTER_USER_SSH_PUBLIC_KEY" >> "${ssh_dir}/authorized_keys"
|
||||
chmod 600 "${ssh_dir}/authorized_keys"
|
||||
cat > "${ssh_dir}/config" << EOF
|
||||
Host 10.*
|
||||
StrictHostKeyChecking no
|
||||
UserKnownHostsFile /dev/null
|
||||
EOF
|
||||
chmod 600 "${ssh_dir}/config"
|
||||
mv slurm_cluster_user_ssh_private_key "${ssh_dir}/id_rsa"
|
||||
chmod 600 "${ssh_dir}/id_rsa"
|
||||
chown -R "${cluster_user}:_azbatchgrp" "$ssh_dir"
|
||||
echo "Cluster user setup complete"
|
||||
|
||||
# add slurm user
|
||||
groupadd -g 64030 slurm
|
||||
useradd -u 64030 -N -g 64030 -p '!' -s /bin/bash -m -d /home/slurm slurm
|
||||
slurm_uid=$(id -u slurm)
|
||||
slurm_gid=$(id -g slurm)
|
||||
|
||||
# install slurm packages
|
||||
if [ "$DISTRIB_ID" == "centos" ]; then
|
||||
install_packages epel-release
|
||||
fi
|
||||
install_packages hwloc numactl munge
|
||||
slurm_docker_image="alfpark/slurm:${SLURM_VERSION}-${DISTRIB_ID}-${DISTRIB_RELEASE}"
|
||||
docker pull "$slurm_docker_image"
|
||||
mkdir -p /tmp/slurm
|
||||
docker run --rm -v /tmp/slurm:/tmp/slurm "$slurm_docker_image" \
|
||||
/bin/sh -c 'cp -r /root/* /tmp/slurm/'
|
||||
install_local_packages "/tmp/slurm/${SLURM_PACKAGE}"
|
||||
cp /tmp/slurm/slurmd.service /etc/systemd/system/
|
||||
rm -rf /tmp/slurm
|
||||
docker rmi "$slurm_docker_image"
|
||||
mkdir -p "$SLURM_CONF_DIR" /var/spool/slurm /var/log/slurm
|
||||
chown -R slurm:slurm /var/spool/slurm /var/log/slurm
|
||||
cat << EOF > "/etc/ld.so.conf.d/slurm.conf"
|
||||
/usr/lib/slurm
|
||||
EOF
|
||||
ldconfig
|
||||
ldconfig -p | grep libslurmfull
|
||||
systemctl daemon-reload
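# Optional sanity check (sketch only): the packaged Slurm binaries should now resolve
# against libslurmfull, e.g.
#   slurmd -V    # expected to report the version pinned in SLURM_VERSION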
|
||||
|
||||
# mount Azure file share
|
||||
cat << EOF > "/root/.azfile_creds"
|
||||
username=$storage_account
|
||||
password=$storage_key
|
||||
EOF
|
||||
chmod 600 /root/.azfile_creds
|
||||
mkdir -p "$AZFILE_MOUNT_DIR"
|
||||
chmod 755 "$AZFILE_MOUNT_DIR"
|
||||
share="${storage_prefix}slurm"
|
||||
echo "//${storage_account}.file.${storage_ep}/${share} ${AZFILE_MOUNT_DIR} cifs vers=3.0,credentials=/root/.azfile_creds,uid=${slurm_uid},gid=${slurm_gid},_netdev,serverino 0 0" >> /etc/fstab
|
||||
mount "$AZFILE_MOUNT_DIR"
|
||||
|
||||
azfile_cluster_path="${AZFILE_MOUNT_DIR}/${cluster_id}"
|
||||
|
||||
# configure munge
|
||||
shared_munge_key_path="${azfile_cluster_path}/munge"
|
||||
shared_munge_key="${shared_munge_key_path}/munge.key"
|
||||
# export munge key to storage
|
||||
# poll for munge key
|
||||
echo "Waiting for munge key"
|
||||
while [ ! -s "$shared_munge_key" ]; do
|
||||
sleep 1
|
||||
done
|
||||
echo "Munge key found."
|
||||
cp -f "$shared_munge_key" /etc/munge/munge.key
|
||||
chmod 400 /etc/munge/munge.key
|
||||
chown munge:munge /etc/munge/munge.key
|
||||
if [ "$DISTRIB_ID" == "centos" ]; then
|
||||
systemctl start munge
|
||||
fi
|
||||
munge -n | unmunge
|
||||
systemctl enable munge
|
||||
systemctl restart munge
|
||||
systemctl --no-pager status munge
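# Optional cross-node check (sketch only): a credential minted on this node should
# decode on the controller; the hostname below is a placeholder.
#   munge -n | ssh <controller-host> unmunge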
|
||||
|
||||
# configure slurm
|
||||
mkdir -p /var/spool/slurmd
|
||||
chown -R slurm:slurm /var/spool/slurmd
|
||||
# construct cgroup conf files
|
||||
cat << EOF > "${SLURM_CONF_DIR}/cgroup.conf"
|
||||
CgroupAutomount=yes
|
||||
ConstrainCores=yes
|
||||
ConstrainDevices=yes
|
||||
#ConstrainRAMSpace=yes
|
||||
EOF
|
||||
|
||||
cat << EOF > "${SLURM_CONF_DIR}/cgroup_allowed_devices_file.conf"
|
||||
/dev/null
|
||||
/dev/urandom
|
||||
/dev/zero
|
||||
/dev/sda*
|
||||
/dev/sdb*
|
||||
/dev/cpu/*/*
|
||||
/dev/pts/*
|
||||
/dev/nvidia*
|
||||
/dev/infiniband/*
|
||||
EOF
|
||||
|
||||
# copy configuration file
|
||||
slurm_conf_azfile_path="${azfile_cluster_path}/slurm/conf"
|
||||
echo "Waiting for slurm configuration file in $slurm_conf_azfile_path"
|
||||
while [ ! -s "${slurm_conf_azfile_path}/slurm.conf" ]; do
|
||||
sleep 1
|
||||
done
|
||||
echo "Slurm configuration file found."
|
||||
cp -f "${slurm_conf_azfile_path}/slurm.conf" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
chmod 644 "${SLURM_CONF_DIR}/slurm.conf"
|
||||
|
||||
check_provisioning_status "$host" 0
|
||||
|
||||
# set hostname, reboot required
|
||||
hostnamectl set-hostname "$host"
|
||||
hostnamectl status
|
||||
|
||||
# construct gres.conf for GPUs
|
||||
set +e
|
||||
gpus=$(lspci | grep -i nvidia | awk '{print $1}' | cut -d : -f 1)
|
||||
set -e
|
||||
if [ -n "$gpus" ]; then
|
||||
gres_file="${SLURM_CONF_DIR}/gres.conf"
|
||||
count=0
|
||||
for i in $gpus; do
|
||||
CPUAFFINITY=$(cat /sys/class/pci_bus/"$i":00/cpulistaffinity)
|
||||
echo "NodeName=${host} Name=gpu File=/dev/nvidia${count} CPUs=${CPUAFFINITY}" >> "$gres_file"
|
||||
count=$((count+1))
|
||||
done
|
||||
chmod 644 "$gres_file"
|
||||
chown slurm:slurm "$gres_file"
|
||||
fi
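# For illustration only: on a hypothetical node with two GPUs the loop above emits
# gres.conf lines of this form (CPU ranges are placeholders read from sysfs):
#   NodeName=<host> Name=gpu File=/dev/nvidia0 CPUs=0-5
#   NodeName=<host> Name=gpu File=/dev/nvidia1 CPUs=6-11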
|
||||
|
||||
log INFO "Rebooting for hostname propagation to DNS"
|
||||
shutdown -r now
|
||||
|
||||
# TODO add slurm pam auth (prevent users from SSHing into a compute node without an allocation)
|
||||
#install_packages libpam-slurm
|
||||
#echo "" >> /etc/pam.d/sshd
|
||||
#echo "account required pam_slurm.so" >> /etc/pam.d/sshd
|
|
@ -0,0 +1,858 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# shellcheck disable=SC1039,SC1091,SC2129
|
||||
|
||||
set -e
|
||||
set -o pipefail
|
||||
|
||||
# version consts
|
||||
SLURM_VERSION=18.08.5-2
|
||||
DOCKER_CE_VERSION_DEBIAN=18.09.2
|
||||
GLUSTER_VERSION_DEBIAN=4.1
|
||||
GLUSTER_VERSION_CENTOS=41
|
||||
|
||||
# consts
|
||||
DOCKER_CE_PACKAGE_DEBIAN="docker-ce=5:${DOCKER_CE_VERSION_DEBIAN}~3-0~"
|
||||
SLURM_CONF_DIR=/etc/slurm
|
||||
AZFILE_MOUNT_DIR=/azfile-slurm
|
||||
SHIPYARD_VAR_DIR=/var/batch-shipyard
|
||||
SHIPYARD_SLURM_PY=${SHIPYARD_VAR_DIR}/slurm.py
|
||||
SHIPYARD_CONF_FILE=${SHIPYARD_VAR_DIR}/slurm.json
|
||||
HOSTNAME=$(hostname -s)
|
||||
HOSTNAME=${HOSTNAME,,}
|
||||
SHIPYARD_STORAGE_CLUSTER_FSTAB=$(<sdv.fstab)
|
||||
|
||||
log() {
|
||||
local level=$1
|
||||
shift
|
||||
echo "$(date -u -Ins) - $level - $*"
|
||||
}
|
||||
|
||||
# dump uname immediately
|
||||
uname -ar
|
||||
|
||||
# try to get distrib vars
|
||||
if [ -e /etc/os-release ]; then
|
||||
. /etc/os-release
|
||||
DISTRIB_ID=$ID
|
||||
DISTRIB_RELEASE=$VERSION_ID
|
||||
DISTRIB_CODENAME=$VERSION_CODENAME
|
||||
if [ -z "$DISTRIB_CODENAME" ]; then
|
||||
if [ "$DISTRIB_ID" == "debian" ] && [ "$DISTRIB_RELEASE" == "9" ]; then
|
||||
DISTRIB_CODENAME=stretch
|
||||
fi
|
||||
fi
|
||||
else
|
||||
if [ -e /etc/lsb-release ]; then
|
||||
. /etc/lsb-release
|
||||
fi
|
||||
fi
|
||||
if [ -z "${DISTRIB_ID+x}" ] || [ -z "${DISTRIB_RELEASE+x}" ]; then
|
||||
log ERROR "Unknown DISTRIB_ID or DISTRIB_RELEASE."
|
||||
exit 1
|
||||
fi
|
||||
if [ -z "${DISTRIB_CODENAME}" ]; then
|
||||
log WARNING "Unknown DISTRIB_CODENAME."
|
||||
fi
|
||||
DISTRIB_ID=${DISTRIB_ID,,}
|
||||
DISTRIB_RELEASE=${DISTRIB_RELEASE,,}
|
||||
DISTRIB_CODENAME=${DISTRIB_CODENAME,,}
|
||||
|
||||
# set distribution specific vars
|
||||
PACKAGER=
|
||||
USER_MOUNTPOINT=/mnt/resource
|
||||
SYSTEMD_PATH=/lib/systemd/system
|
||||
if [ "$DISTRIB_ID" == "ubuntu" ]; then
|
||||
PACKAGER=apt
|
||||
USER_MOUNTPOINT=/mnt
|
||||
elif [ "$DISTRIB_ID" == "debian" ]; then
|
||||
PACKAGER=apt
|
||||
elif [[ $DISTRIB_ID == centos* ]] || [ "$DISTRIB_ID" == "rhel" ]; then
|
||||
PACKAGER=yum
|
||||
else
|
||||
PACKAGER=zypper
|
||||
SYSTEMD_PATH=/usr/lib/systemd/system
|
||||
fi
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
fi
|
||||
|
||||
# globals
|
||||
aad_cloud=
|
||||
cluster_id=
|
||||
cluster_name=
|
||||
cluster_user=
|
||||
controller_primary=
|
||||
controller_secondary=
|
||||
controller_tertiary=
|
||||
is_primary=0
|
||||
is_login_node=0
|
||||
num_controllers=
|
||||
sc_args=
|
||||
slurm_state_path=
|
||||
storage_account=
|
||||
storage_prefix=
|
||||
storage_rg=
|
||||
shipyardversion=
|
||||
|
||||
# process command line options
|
||||
while getopts "h?a:c:i:lm:p:s:u:v:" opt; do
|
||||
case "$opt" in
|
||||
h|\?)
|
||||
echo "shipyard_slurm_master_bootstrap.sh parameters"
|
||||
echo ""
|
||||
echo "-a [aad cloud type] AAD cloud type for MSI"
|
||||
echo "-c [primary:secondary:tertiary] controller hosts"
|
||||
echo "-i [id] cluster id"
|
||||
echo "-l is login node"
|
||||
echo "-m [type:scid] mount storage cluster"
|
||||
echo "-p [path] state save path"
|
||||
echo "-s [storage account:resource group:prefix] storage config"
|
||||
echo "-u [user] cluster username"
|
||||
echo "-v [version] batch-shipyard version"
|
||||
echo ""
|
||||
exit 1
|
||||
;;
|
||||
a)
|
||||
aad_cloud=${OPTARG,,}
|
||||
;;
|
||||
c)
|
||||
IFS=':' read -ra cont <<< "${OPTARG,,}"
|
||||
controller_primary=${cont[0]}
|
||||
if [ "$controller_primary" == "$HOSTNAME" ]; then
|
||||
is_primary=1
|
||||
fi
|
||||
controller_secondary=${cont[1]}
|
||||
controller_tertiary=${cont[2]}
|
||||
num_controllers=${#cont[@]}
|
||||
;;
|
||||
i)
|
||||
IFS='-' read -ra clus <<< "${OPTARG,,}"
|
||||
cluster_id=${OPTARG}
|
||||
cluster_name=${clus[0]}
|
||||
;;
|
||||
l)
|
||||
is_login_node=1
|
||||
;;
|
||||
m)
|
||||
IFS=',' read -ra sc_args <<< "${OPTARG,,}"
|
||||
;;
|
||||
p)
|
||||
slurm_state_path=${OPTARG}
|
||||
;;
|
||||
s)
|
||||
IFS=':' read -ra ss <<< "${OPTARG,,}"
|
||||
storage_account=${ss[0]}
|
||||
storage_rg=${ss[1]}
|
||||
storage_prefix=${ss[2]}
|
||||
;;
|
||||
u)
|
||||
cluster_user=${OPTARG}
|
||||
;;
|
||||
v)
|
||||
shipyardversion=$OPTARG
|
||||
;;
|
||||
esac
|
||||
done
|
||||
shift $((OPTIND-1))
|
||||
[ "$1" = "--" ] && shift
|
||||
|
||||
check_for_buggy_ntfs_mount() {
|
||||
# Check to ensure sdb1 mount is not mounted as ntfs
|
||||
set +e
|
||||
mount | grep /dev/sdb1 | grep fuseblk
|
||||
local rc=$?
|
||||
set -e
|
||||
if [ $rc -eq 0 ]; then
|
||||
log ERROR "/dev/sdb1 temp disk is mounted as fuseblk/ntfs"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
execute_command_with_retry() {
|
||||
local retries=30
|
||||
set +e
|
||||
while [ $retries -gt 0 ]; do
|
||||
"$@"
|
||||
rc=$?
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Command failed: $*"
|
||||
exit $rc
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
download_file_as() {
|
||||
log INFO "Downloading: $1 as $2"
|
||||
local retries=10
|
||||
set +e
|
||||
while [ $retries -gt 0 ]; do
|
||||
if curl -fSsL "$1" -o "$2"; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not download: $1"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
add_repo() {
|
||||
local url=$1
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
curl -fSsL "$url" | apt-key add -
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum-config-manager --add-repo "$url"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper addrepo "$url"
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not add repo: $url"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
refresh_package_index() {
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
apt-get update
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum makecache -y fast
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper -n --gpg-auto-import-keys ref
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not update package index"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
install_packages() {
|
||||
set +e
|
||||
local retries=120
|
||||
local rc
|
||||
while [ $retries -gt 0 ]; do
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
apt-get install -y -q -o Dpkg::Options::="--force-confnew" --no-install-recommends "$@"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
yum install -y "$@"
|
||||
rc=$?
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
zypper -n in "$@"
|
||||
rc=$?
|
||||
fi
|
||||
if [ $rc -eq 0 ]; then
|
||||
break
|
||||
fi
|
||||
retries=$((retries-1))
|
||||
if [ $retries -eq 0 ]; then
|
||||
log ERROR "Could not install packages ($PACKAGER): $*"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
set -e
|
||||
}
|
||||
|
||||
install_docker_host_engine() {
|
||||
log DEBUG "Installing Docker Host Engine"
|
||||
# set vars
|
||||
local srvstart="systemctl start docker.service"
|
||||
local srvstop="systemctl stop docker.service"
|
||||
local srvdisable="systemctl disable docker.service"
|
||||
local srvstatus="systemctl --no-pager status docker.service"
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
local repo=https://download.docker.com/linux/"${DISTRIB_ID}"
|
||||
local gpgkey="${repo}"/gpg
|
||||
local dockerversion="${DOCKER_CE_PACKAGE_DEBIAN}${DISTRIB_ID}-${DISTRIB_CODENAME}"
|
||||
local prereq_pkgs="apt-transport-https ca-certificates curl gnupg2 software-properties-common"
|
||||
elif [ "$PACKAGER" == "yum" ]; then
|
||||
local repo=https://download.docker.com/linux/centos/docker-ce.repo
|
||||
local dockerversion="${DOCKER_CE_PACKAGE_CENTOS}"
|
||||
local prereq_pkgs="yum-utils device-mapper-persistent-data lvm2"
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
if [[ "$DISTRIB_RELEASE" == 12-sp3* ]]; then
|
||||
local repodir=SLE_12_SP3
|
||||
fi
|
||||
local repo="http://download.opensuse.org/repositories/Virtualization:containers/${repodir}/Virtualization:containers.repo"
|
||||
local dockerversion="${DOCKER_CE_PACKAGE_SLES}"
|
||||
fi
|
||||
# refresh package index
|
||||
refresh_package_index
|
||||
# install required software first
|
||||
# shellcheck disable=SC2086
|
||||
install_packages $prereq_pkgs
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
# add gpgkey for repo
|
||||
add_repo "$gpgkey"
|
||||
# add repo
|
||||
add-apt-repository "deb [arch=amd64] $repo $(lsb_release -cs) stable"
|
||||
else
|
||||
add_repo "$repo"
|
||||
fi
|
||||
# refresh index
|
||||
refresh_package_index
|
||||
# install docker engine
|
||||
install_packages "$dockerversion"
|
||||
# disable docker from auto-start due to temp disk issues
|
||||
$srvstop
|
||||
$srvdisable
|
||||
# ensure docker daemon modifications are idempotent
|
||||
rm -rf /var/lib/docker
|
||||
mkdir -p /etc/docker
|
||||
echo "{ \"data-root\": \"$USER_MOUNTPOINT/docker\", \"hosts\": [ \"unix:///var/run/docker.sock\", \"tcp://127.0.0.1:2375\" ] }" > /etc/docker/daemon.json
|
||||
# ensure no options are specified after dockerd
|
||||
sed -i 's|^ExecStart=/usr/bin/dockerd.*|ExecStart=/usr/bin/dockerd|' "${SYSTEMD_PATH}"/docker.service
|
||||
systemctl daemon-reload
|
||||
$srvstart
|
||||
$srvstatus
|
||||
docker info
|
||||
log INFO "Docker Host Engine installed"
|
||||
}
|
||||
|
||||
install_storage_cluster_dependencies() {
|
||||
if [ -z "$sc_args" ]; then
|
||||
return
|
||||
fi
|
||||
log DEBUG "Installing storage cluster dependencies"
|
||||
if [ "$PACKAGER" == "zypper" ]; then
|
||||
if [[ "$DISTRIB_RELEASE" == 12-sp3* ]]; then
|
||||
local repodir=SLE_12_SP3
|
||||
fi
|
||||
local repo="http://download.opensuse.org/repositories/filesystems/${repodir}/filesystems.repo"
|
||||
fi
|
||||
for sc_arg in "${sc_args[@]}"; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
server_type=${sc[0]}
|
||||
if [ "$server_type" == "nfs" ]; then
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
install_packages nfs-common nfs4-acl-tools
|
||||
elif [ "$PACKAGER" == "yum" ] ; then
|
||||
install_packages nfs-utils nfs4-acl-tools
|
||||
systemctl enable rpcbind
|
||||
systemctl start rpcbind
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
install_packages nfs-client nfs4-acl-tools
|
||||
systemctl enable rpcbind
|
||||
systemctl start rpcbind
|
||||
fi
|
||||
elif [ "$server_type" == "glusterfs" ]; then
|
||||
if [ "$PACKAGER" == "apt" ]; then
|
||||
if [ "$DISTRIB_ID" == "debian" ]; then
|
||||
add_repo "http://download.gluster.org/pub/gluster/glusterfs/${GLUSTER_VERSION_DEBIAN}/rsa.pub"
|
||||
else
|
||||
add-apt-repository ppa:gluster/glusterfs-${GLUSTER_VERSION_DEBIAN}
|
||||
fi
|
||||
install_packages glusterfs-client acl
|
||||
elif [ "$PACKAGER" == "yum" ] ; then
|
||||
install_packages centos-release-gluster${GLUSTER_VERSION_CENTOS}
|
||||
install_packages glusterfs-server acl
|
||||
elif [ "$PACKAGER" == "zypper" ]; then
|
||||
add_repo "$repo"
|
||||
"$PACKAGER" -n --gpg-auto-import-keys ref
|
||||
install_packages glusterfs acl
|
||||
fi
|
||||
else
|
||||
log ERROR "Unknown file server type ${sc[0]} for ${sc[1]}"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
log INFO "Storage cluster dependencies installed"
|
||||
}
|
||||
|
||||
process_fstab_entry() {
|
||||
local desc=$1
|
||||
local fstab_entry=$2
|
||||
IFS=' ' read -ra fs <<< "$fstab_entry"
|
||||
local mountpoint="${fs[1]}"
|
||||
log INFO "Creating host directory for $desc at $mountpoint"
|
||||
mkdir -p "$mountpoint"
|
||||
chmod 777 "$mountpoint"
|
||||
echo "INFO: Adding $mountpoint to fstab"
|
||||
echo "$fstab_entry" >> /etc/fstab
|
||||
tail -n1 /etc/fstab
|
||||
echo "INFO: Mounting $mountpoint"
|
||||
local START
|
||||
START=$(date -u +"%s")
|
||||
set +e
|
||||
while :
|
||||
do
|
||||
if mount "$mountpoint"; then
|
||||
break
|
||||
else
|
||||
local NOW
|
||||
NOW=$(date -u +"%s")
|
||||
local DIFF=$(((NOW-START)/60))
|
||||
# fail after 5 minutes of attempts
|
||||
if [ $DIFF -ge 5 ]; then
|
||||
echo "ERROR: Could not mount $desc on $mountpoint"
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
fi
|
||||
done
|
||||
set -e
|
||||
log INFO "$mountpoint mounted."
|
||||
}
|
||||
|
||||
process_storage_clusters() {
|
||||
if [ -n "$sc_args" ]; then
|
||||
log DEBUG "Processing storage clusters"
|
||||
IFS='#' read -ra fstabs <<< "$SHIPYARD_STORAGE_CLUSTER_FSTAB"
|
||||
i=0
|
||||
for sc_arg in "${sc_args[@]}"; do
|
||||
IFS=':' read -ra sc <<< "$sc_arg"
|
||||
fstab_entry="${fstabs[$i]//,noauto/,auto}"
|
||||
process_fstab_entry "$sc_arg" "$fstab_entry"
|
||||
i=$((i + 1))
|
||||
done
|
||||
log INFO "Storage clusters processed"
|
||||
fi
|
||||
}
|
||||
|
||||
install_systemd_unit_file() {
|
||||
cat << EOF > /etc/systemd/system/batch-shipyard-slurm.service
|
||||
[Unit]
|
||||
Description=Batch Shipyard Slurm Helper
|
||||
After=network.target network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
TimeoutStartSec=0
|
||||
Restart=always
|
||||
LimitNOFILE=65536
|
||||
LimitCORE=infinity
|
||||
OOMScoreAdjust=-100
|
||||
IOSchedulingClass=best-effort
|
||||
IOSchedulingPriority=0
|
||||
Environment=LC_CTYPE=en_US.UTF-8 PYTHONIOENCODING=utf-8
|
||||
WorkingDirectory=/var/batch-shipyard
|
||||
ExecStart=${SHIPYARD_SLURM_PY} daemon --conf ${SHIPYARD_CONF_FILE}
|
||||
StandardOutput=null
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOF
|
||||
systemctl daemon-reload
|
||||
log INFO "systemd unit file installed"
|
||||
}
|
||||
|
||||
create_batch_shipyard_slurm_config() {
|
||||
mkdir -p ${SHIPYARD_VAR_DIR}
|
||||
chmod 755 ${SHIPYARD_VAR_DIR}
|
||||
# get timeouts
|
||||
local resume_timeout
|
||||
local suspend_timeout
|
||||
resume_timeout=$(grep '^ResumeTimeout=' slurm.conf | cut -d '=' -f 2)
|
||||
suspend_timeout=$(grep '^SuspendTimeout=' slurm.conf | cut -d '=' -f 2)
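# e.g. "ResumeTimeout=1200" and "SuspendTimeout=1200" lines in slurm.conf yield
# resume_timeout=1200 and suspend_timeout=1200 here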
|
||||
cat > ${SHIPYARD_CONF_FILE} << EOF
|
||||
{
|
||||
"aad_cloud": "$aad_cloud",
|
||||
"storage": {
|
||||
"account": "$storage_account",
|
||||
"resource_group": "$storage_rg",
|
||||
"entity_prefix": "$storage_prefix",
|
||||
"queues": {
|
||||
"action": "$cluster_id"
|
||||
},
|
||||
"azfile_mount_dir": "$AZFILE_MOUNT_DIR"
|
||||
},
|
||||
"cluster_id": "$cluster_id",
|
||||
"cluster_name": "$cluster_name",
|
||||
"logging_id": "$HOSTNAME",
|
||||
"is_primary": "$is_primary",
|
||||
"timeouts": {
|
||||
"resume": $resume_timeout,
|
||||
"suspend": $suspend_timeout
|
||||
},
|
||||
"batch_shipyard": {
|
||||
"var_path": "$SHIPYARD_VAR_DIR",
|
||||
"version": "$shipyardversion"
|
||||
}
|
||||
}
|
||||
EOF
|
||||
chmod 600 "$SHIPYARD_CONF_FILE"
|
||||
log INFO "Batch Shipyard slurm config created"
|
||||
}
|
||||
|
||||
log INFO "Bootstrap start"
|
||||
echo "Configuration:"
|
||||
echo "--------------"
|
||||
echo "OS Distribution: $DISTRIB_ID $DISTRIB_RELEASE $DISTRIB_CODENAME"
|
||||
echo "Hostname: $HOSTNAME"
|
||||
echo "Batch Shipyard Version: $shipyardversion"
|
||||
echo "AAD cloud: $aad_cloud"
|
||||
echo "Storage: $storage_account:$storage_rg:$storage_prefix"
|
||||
echo "Storage cluster mount: ${sc_args[*]}"
|
||||
echo "Cluster Id: $cluster_id"
|
||||
echo "Cluster Name: $cluster_name"
|
||||
echo "Cluster user: $cluster_user"
|
||||
echo "Controllers: $controller_primary backups($controller_secondary,$controller_tertiary)"
|
||||
echo "Number of controllers: $num_controllers"
|
||||
echo "Is Primary Controller: $is_primary"
|
||||
echo "Is Login node: $is_login_node"
|
||||
echo ""
|
||||
|
||||
if [ "$is_primary" -eq 1 ] && [ "$is_login_node" -eq 1 ]; then
|
||||
log ERROR "Cannot be designated as primary and login simultaneously"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check sdb1 mount
|
||||
check_for_buggy_ntfs_mount
|
||||
|
||||
# set sudoers to not require tty
|
||||
sed -i 's/^Defaults[ ]*requiretty/# Defaults requiretty/g' /etc/sudoers
|
||||
|
||||
# install docker
|
||||
install_docker_host_engine
|
||||
|
||||
# install required base software
|
||||
install_packages build-essential libffi-dev libssl-dev python3-dev
|
||||
curl -fSsL https://bootstrap.pypa.io/get-pip.py | python3
|
||||
|
||||
# check or install dependencies for storage cluster mount
|
||||
if [ -n "$sc_args" ]; then
|
||||
install_storage_cluster_dependencies
|
||||
fi
|
||||
# process and mount storage clusters
|
||||
process_storage_clusters
|
||||
|
||||
# write batch shipyard config
|
||||
create_batch_shipyard_slurm_config
|
||||
|
||||
# align uid/gid/permissions to batch pool
|
||||
usermod -u 2000 "$cluster_user"
|
||||
groupmod -n _azbatchgrp "$cluster_user"
|
||||
chown -R "${cluster_user}:_azbatchgrp" "/home/${cluster_user}"
|
||||
useradd -o -u 1000 -N -g 1000 -p '!' -s /bin/bash -m -d /home/_azbatch _azbatch
|
||||
|
||||
# install program deps and copy main program
|
||||
pip3 install -r requirements.txt
|
||||
chmod 755 slurm.py
|
||||
cp -f slurm.py "$SHIPYARD_SLURM_PY"
|
||||
|
||||
# add slurm user
|
||||
groupadd -g 64030 slurm
|
||||
useradd -u 64030 -N -g 64030 -p '!' -s /bin/bash -m -d /home/slurm slurm
|
||||
slurm_uid=$(id -u slurm)
|
||||
slurm_gid=$(id -g slurm)
|
||||
|
||||
# install all slurm-related packages
|
||||
if [ "$is_login_node" -eq 1 ]; then
|
||||
install_packages munge
|
||||
else
|
||||
install_packages munge
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
install_packages mariadb-server libmysqlclient20 libmariadb3
|
||||
fi
|
||||
fi
|
||||
slurm_docker_image="alfpark/slurm:${SLURM_VERSION}-${DISTRIB_ID}-${DISTRIB_RELEASE}"
|
||||
docker pull "$slurm_docker_image"
|
||||
mkdir -p /tmp/slurm
|
||||
docker run --rm -v /tmp/slurm:/tmp/slurm "$slurm_docker_image" \
|
||||
/bin/sh -c 'cp -r /root/* /tmp/slurm/'
|
||||
dpkg -i "/tmp/slurm/slurm-${SLURM_VERSION}_1.0_amd64.deb"
|
||||
if [ "$is_login_node" -eq 0 ]; then
|
||||
cp /tmp/slurm/slurmctld.service /etc/systemd/system/
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
cp /tmp/slurm/slurmdbd.service /etc/systemd/system/
|
||||
fi
|
||||
fi
|
||||
rm -rf /tmp/slurm
|
||||
docker rmi "$slurm_docker_image"
|
||||
mkdir -p "$SLURM_CONF_DIR" /var/spool/slurm /var/log/slurm
|
||||
chown -R slurm:slurm /var/spool/slurm /var/log/slurm
|
||||
cat << EOF > "/etc/ld.so.conf.d/slurm.conf"
|
||||
/usr/lib/slurm
|
||||
EOF
|
||||
ldconfig
|
||||
ldconfig -p | grep libslurmfull
|
||||
systemctl daemon-reload
|
||||
|
||||
# retrieve storage account key and endpoint
|
||||
echo "Retrieving storage account credentials for fileshare"
|
||||
sa=$(${SHIPYARD_SLURM_PY} sakey --conf "${SHIPYARD_CONF_FILE}")
|
||||
IFS=' ' read -ra ss <<< "${sa}"
|
||||
storage_key=${ss[0]}
|
||||
storage_ep=${ss[1]}
|
||||
storage_ep="${storage_ep%"${storage_ep##*[![:space:]]}"}"
|
||||
|
||||
# mount Azure file share
|
||||
cat << EOF > "/root/.azfile_creds"
|
||||
username=$storage_account
|
||||
password=$storage_key
|
||||
EOF
|
||||
chmod 600 /root/.azfile_creds
|
||||
mkdir -p "$AZFILE_MOUNT_DIR"
|
||||
chmod 755 "$AZFILE_MOUNT_DIR"
|
||||
share="${storage_prefix}slurm"
|
||||
echo "//${storage_account}.file.${storage_ep}/${share} ${AZFILE_MOUNT_DIR} cifs vers=3.0,credentials=/root/.azfile_creds,uid=${slurm_uid},gid=${slurm_gid},_netdev,serverino 0 0" >> /etc/fstab
|
||||
mount "$AZFILE_MOUNT_DIR"
|
||||
|
||||
azfile_cluster_path="${AZFILE_MOUNT_DIR}/${cluster_id}"
|
||||
mkdir -p "$azfile_cluster_path"
|
||||
|
||||
slurm_log_path="${azfile_cluster_path}/slurm/logs"
|
||||
mkdir -p "$slurm_log_path"
|
||||
|
||||
# create resume/suspend scripts
|
||||
if [ "$is_login_node" -eq 0 ]; then
|
||||
resume_script="${SHIPYARD_VAR_DIR}/resume.sh"
|
||||
resume_fail_script="${SHIPYARD_VAR_DIR}/resume-fail.sh"
|
||||
suspend_script="${SHIPYARD_VAR_DIR}/suspend.sh"
|
||||
cat > ${resume_script} << 'EOF'
|
||||
#!/usr/bin/env bash
|
||||
|
||||
hostfile="$(mktemp /tmp/slurm_resume.XXXXXX)"
|
||||
|
||||
hosts=$(scontrol show hostnames $1)
|
||||
touch $hostfile
|
||||
for host in $hosts; do
|
||||
part=$(sinfo -h -n $host -N -o "%R")
|
||||
echo "$host $part" >> $hostfile
|
||||
done
|
||||
|
||||
EOF
|
||||
|
||||
cat >> ${resume_script} << EOF
|
||||
${SHIPYARD_SLURM_PY} resume --conf ${SHIPYARD_CONF_FILE} \\
|
||||
EOF
|
||||
|
||||
cat >> ${resume_script} << 'EOF'
|
||||
--hostfile $hostfile \
|
||||
EOF
|
||||
|
||||
cat >> ${resume_script} << EOF
|
||||
>> ${slurm_log_path}/power-save.log 2>&1
|
||||
EOF
|
||||
|
||||
cat >> ${resume_script} << 'EOF'
|
||||
ec=$?
|
||||
rm -f $hostfile
|
||||
exit $ec
|
||||
EOF
|
||||
|
||||
cat > ${resume_fail_script} << 'EOF'
|
||||
#!/usr/bin/env bash
|
||||
|
||||
hostfile="$(mktemp /tmp/slurm_resume_fail.XXXXXX)"
|
||||
|
||||
hosts=$(scontrol show hostnames $1)
|
||||
touch $hostfile
|
||||
for host in $hosts; do
|
||||
part=$(sinfo -h -n $host -N -o "%R")
|
||||
echo "$host $part" >> $hostfile
|
||||
done
|
||||
|
||||
EOF
|
||||
|
||||
cat >> ${resume_fail_script} << EOF
|
||||
${SHIPYARD_SLURM_PY} resume-fail --conf ${SHIPYARD_CONF_FILE} \\
|
||||
EOF
|
||||
|
||||
cat >> ${resume_fail_script} << 'EOF'
|
||||
--hostfile $hostfile \
|
||||
EOF
|
||||
|
||||
cat >> ${resume_fail_script} << EOF
|
||||
>> ${slurm_log_path}/power-save.log 2>&1
|
||||
EOF
|
||||
|
||||
cat >> ${resume_fail_script} << 'EOF'
|
||||
ec=$?
|
||||
rm -f $hostfile
|
||||
exit $ec
|
||||
EOF
|
||||
|
||||
|
||||
cat > ${suspend_script} << 'EOF'
|
||||
#!/usr/bin/env bash
|
||||
|
||||
hostfile="$(mktemp /tmp/slurm_resume.XXXXXX)"
|
||||
|
||||
scontrol show hostnames $1 > $hostfile
|
||||
EOF
|
||||
|
||||
cat >> ${suspend_script} << EOF
|
||||
${SHIPYARD_SLURM_PY} suspend --conf ${SHIPYARD_CONF_FILE} \\
|
||||
EOF
|
||||
|
||||
cat >> ${suspend_script} << 'EOF'
|
||||
--hostfile $hostfile \
|
||||
EOF
|
||||
|
||||
cat >> ${suspend_script} << EOF
|
||||
>> ${slurm_log_path}/power-save.log 2>&1
|
||||
EOF
|
||||
|
||||
cat >> ${suspend_script} << 'EOF'
|
||||
ec=$?
|
||||
rm -f $hostfile
|
||||
exit $ec
|
||||
EOF
|
||||
|
||||
chmod 755 "${resume_script}" "${resume_fail_script}" "${suspend_script}"
|
||||
fi
|
||||
|
||||
chown -R slurm:slurm "${SHIPYARD_VAR_DIR}"
|
||||
|
||||
# configure munge
|
||||
shared_munge_key_path="${azfile_cluster_path}/munge"
|
||||
shared_munge_key="${shared_munge_key_path}/munge.key"
|
||||
# export munge key to storage
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
munge -n | unmunge
|
||||
mkdir -p "$shared_munge_key_path"
|
||||
cp -f /etc/munge/munge.key "$shared_munge_key"
|
||||
# ensure munge key is "marked" read/write to prevent read-only deletion failures
|
||||
chmod 660 "$shared_munge_key"
|
||||
else
|
||||
# poll for munge key
|
||||
echo "Waiting for primary munge key"
|
||||
while [ ! -s "$shared_munge_key" ]; do
|
||||
sleep 1
|
||||
done
|
||||
cp -f "$shared_munge_key" /etc/munge/munge.key
|
||||
chmod 400 /etc/munge/munge.key
|
||||
chown munge:munge /etc/munge/munge.key
|
||||
munge -n | unmunge
|
||||
fi
|
||||
systemctl enable munge
|
||||
systemctl restart munge
|
||||
systemctl --no-pager status munge
|
||||
|
||||
# start mariadb and prepare database
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
systemctl enable mariadb
|
||||
systemctl start mariadb
|
||||
systemctl --no-pager status mariadb
|
||||
|
||||
# create db table
|
||||
chmod 600 slurmdb.sql
|
||||
cp slurmdb.sql "${SLURM_CONF_DIR}/"
|
||||
# shellcheck disable=SC2002
|
||||
cat "${SLURM_CONF_DIR}/slurmdb.sql" | mysql -u root
|
||||
fi
|
||||
|
||||
# copy and modify configuration files
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
# create state save location
|
||||
mkdir -p "${slurm_state_path}"
|
||||
chown -R slurm:slurm "${slurm_state_path}"
|
||||
chmod 750 "${slurm_state_path}"
|
||||
|
||||
cp slurm.conf "${SLURM_CONF_DIR}/"
|
||||
sed -i "s|{SHIPYARD_VAR_DIR}|${SHIPYARD_VAR_DIR}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
sed -i "s|{SLURM_LOG_PATH}|${slurm_log_path}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
sed -i "s|{HOSTNAME}|${HOSTNAME}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
sed -i "s|{SLURMCTLD_STATE_SAVE_PATH}|${slurm_state_path}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
sed -i "s|{SLURMCTLD_HOST_PRIMARY}|${controller_primary}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
if [ -n "$controller_secondary" ]; then
|
||||
sed -i "s|^#{SLURMCTLD_HOST_SECONDARY}|SlurmctldHost=${controller_secondary}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
fi
|
||||
if [ -n "$controller_tertiary" ]; then
|
||||
sed -i "s|^#{SLURMCTLD_HOST_TERTIARY}|SlurmctldHost=${controller_tertiary}|g" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
fi
|
||||
|
||||
cp slurmdbd.conf "${SLURM_CONF_DIR}/"
|
||||
sed -i "s|{SLURM_LOG_PATH}|${slurm_log_path}|g" "${SLURM_CONF_DIR}/slurmdbd.conf"
|
||||
sed -i "s|{HOSTNAME}|${HOSTNAME}|g" "${SLURM_CONF_DIR}/slurmdbd.conf"
|
||||
|
||||
chown slurm:slurm "${SLURM_CONF_DIR}/slurm.conf"
|
||||
chmod 644 "${SLURM_CONF_DIR}/slurm.conf"
|
||||
chmod 600 "${SLURM_CONF_DIR}/slurmdbd.conf"
|
||||
fi
|
||||
|
||||
# start slurm db service
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
systemctl enable slurmdbd
|
||||
systemctl start slurmdbd
|
||||
systemctl --no-pager status slurmdbd
|
||||
|
||||
# delay before executing as dbd may not be fully up
|
||||
sleep 5
|
||||
|
||||
# initialize account in db
|
||||
execute_command_with_retry sacctmgr -i add cluster "$cluster_name"
|
||||
execute_command_with_retry sacctmgr -i add account compute-account description="Compute accounts" Organization="$cluster_name"
|
||||
execute_command_with_retry sacctmgr -i create user "$cluster_user" account=compute-account adminlevel=None
|
||||
fi
|
||||
|
||||
# copy config and block for secondary/tertiary
|
||||
# start slurm controller service
|
||||
slurm_conf_azfile_path="${azfile_cluster_path}/slurm/conf"
|
||||
if [ "$is_primary" -eq 1 ]; then
|
||||
systemctl enable slurmctld
|
||||
systemctl start slurmctld
|
||||
systemctl --no-pager status slurmctld
|
||||
mkdir -p "$slurm_conf_azfile_path"
|
||||
cp "${SLURM_CONF_DIR}/slurm.conf" "${slurm_conf_azfile_path}/"
|
||||
# ensure slurm conf is "marked" read/write to prevent read-only deletion failures
|
||||
chmod 660 "${slurm_conf_azfile_path}/slurm.conf"
|
||||
else
|
||||
echo "Waiting for primary Slurm configuration file"
|
||||
while [ ! -s "${slurm_conf_azfile_path}/slurm.conf" ]; do
|
||||
sleep 1
|
||||
done
|
||||
echo "Slurm configuration file found."
|
||||
cp -f "${slurm_conf_azfile_path}/slurm.conf" "${SLURM_CONF_DIR}/slurm.conf"
|
||||
chown slurm:slurm "${SLURM_CONF_DIR}/slurm.conf"
|
||||
chmod 644 "${SLURM_CONF_DIR}/slurm.conf"
|
||||
if [ "$is_login_node" -eq 0 ]; then
|
||||
systemctl enable slurmctld
|
||||
systemctl start slurmctld
|
||||
systemctl --no-pager status slurmctld
|
||||
fi
|
||||
fi
|
||||
|
||||
# start daemon
|
||||
if [ "$is_login_node" -eq 0 ]; then
|
||||
# setup systemd unit file
|
||||
install_systemd_unit_file
|
||||
|
||||
# start batch shipyard slurm daemon mode
|
||||
systemctl enable batch-shipyard-slurm
|
||||
systemctl start batch-shipyard-slurm
|
||||
systemctl --no-pager status batch-shipyard-slurm
|
||||
fi
|
||||
|
||||
log INFO "Bootstrap completed"
|
shipyard.py
|
@ -56,6 +56,7 @@ class CliContext(object):
|
|||
"""CliContext class: holds context for CLI commands"""
|
||||
def __init__(self):
|
||||
"""Ctor for CliContext"""
|
||||
self.cleanup = True
|
||||
self.show_config = False
|
||||
self.verbose = False
|
||||
self.yes = False
|
||||
|
@ -67,6 +68,7 @@ class CliContext(object):
|
|||
self.conf_fs = None
|
||||
self.conf_monitor = None
|
||||
self.conf_federation = None
|
||||
self.conf_slurm = None
|
||||
# clients
|
||||
self.batch_mgmt_client = None
|
||||
self.batch_client = None
|
||||
|
@ -221,6 +223,50 @@ class CliContext(object):
|
|||
convoy.clients.create_storage_clients()
|
||||
self._cleanup_after_initialize()
|
||||
|
||||
def initialize_for_slurm(self, init_batch=False):
|
||||
# type: (CliContext, bool) -> None
|
||||
"""Initialize context for slurm commands
|
||||
:param CliContext self: this
|
||||
:param bool init_batch: initialize batch
|
||||
"""
|
||||
self._read_credentials_config()
|
||||
self._set_global_cli_options()
|
||||
if self.verbose:
|
||||
logger.debug('initializing for slurm actions')
|
||||
self._init_keyvault_client()
|
||||
self._init_config(
|
||||
skip_global_config=False, skip_pool_config=not init_batch,
|
||||
skip_monitor_config=True, skip_federation_config=True,
|
||||
fs_storage=not init_batch)
|
||||
|
||||
self.conf_slurm = self._form_conf_path(
|
||||
self.conf_slurm, 'slurm')
|
||||
if self.conf_slurm is None:
|
||||
raise ValueError('slurm conf file was not specified')
|
||||
self.conf_slurm = CliContext.ensure_pathlib_conf(
|
||||
self.conf_slurm)
|
||||
convoy.validator.validate_config(
|
||||
convoy.validator.ConfigType.Slurm, self.conf_slurm)
|
||||
self._read_config_file(self.conf_slurm)
|
||||
|
||||
self._ensure_credentials_section('storage')
|
||||
self._ensure_credentials_section('slurm')
|
||||
self.auth_client, self.resource_client, self.compute_client, \
|
||||
self.network_client, self.storage_mgmt_client, \
|
||||
self.batch_mgmt_client, self.batch_client = \
|
||||
convoy.clients.create_all_clients(
|
||||
self, batch_clients=init_batch)
|
||||
# inject storage account keys if via aad
|
||||
convoy.fleet.fetch_storage_account_keys_from_aad(
|
||||
self.storage_mgmt_client, self.config, fs_storage=not init_batch)
|
||||
# call populate global settings again to adjust for slurm storage
|
||||
sc = convoy.settings.slurm_credentials_storage(self.config)
|
||||
convoy.fleet.populate_global_settings(
|
||||
self.config, fs_storage=not init_batch, sc=sc)
|
||||
self.blob_client, self.table_client, self.queue_client = \
|
||||
convoy.clients.create_storage_clients()
|
||||
self._cleanup_after_initialize()
|
||||
|
||||
def initialize_for_keyvault(self):
|
||||
# type: (CliContext) -> None
|
||||
"""Initialize context for keyvault commands
|
||||
|
@ -311,6 +357,8 @@ class CliContext(object):
|
|||
"""Cleanup after initialize_for_* funcs
|
||||
:param CliContext self: this
|
||||
"""
|
||||
if not self.cleanup:
|
||||
return
|
||||
# free conf objects
|
||||
del self.conf_credentials
|
||||
del self.conf_fs
|
||||
|
@ -319,6 +367,7 @@ class CliContext(object):
|
|||
del self.conf_jobs
|
||||
del self.conf_monitor
|
||||
del self.conf_federation
|
||||
del self.conf_slurm
|
||||
# free cli options
|
||||
del self.verbose
|
||||
del self.yes
|
||||
|
@ -860,6 +909,19 @@ def monitor_option(f):
|
|||
callback=callback)(f)
|
||||
|
||||
|
||||
def slurm_option(f):
|
||||
def callback(ctx, param, value):
|
||||
clictx = ctx.ensure_object(CliContext)
|
||||
clictx.conf_slurm = value
|
||||
return value
|
||||
return click.option(
|
||||
'--slurm',
|
||||
expose_value=False,
|
||||
envvar='SHIPYARD_SLURM_CONF',
|
||||
help='Slurm config file',
|
||||
callback=callback)(f)
|
||||
|
||||
|
||||
def _storage_cluster_id_argument(f):
|
||||
def callback(ctx, param, value):
|
||||
return value
|
||||
|
@ -930,6 +992,12 @@ def federation_options(f):
|
|||
return f
|
||||
|
||||
|
||||
def slurm_options(f):
|
||||
f = slurm_option(f)
|
||||
f = _azure_subscription_id_option(f)
|
||||
return f
|
||||
|
||||
|
||||
@click.group(context_settings=_CONTEXT_SETTINGS)
|
||||
@click.version_option(version=convoy.__version__)
|
||||
@click.pass_context
|
||||
|
@ -1019,6 +1087,23 @@ def fs_cluster_add(ctx, storage_cluster_id):
|
|||
ctx.blob_client, ctx.config, storage_cluster_id)
|
||||
|
||||
|
||||
@cluster.command('orchestrate')
|
||||
@common_options
|
||||
@fs_cluster_options
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def fs_cluster_orchestrate(ctx, storage_cluster_id):
|
||||
"""Orchestrate a filesystem storage cluster in Azure with the
|
||||
specified disks"""
|
||||
ctx.initialize_for_fs()
|
||||
convoy.fleet.action_fs_disks_add(
|
||||
ctx.resource_client, ctx.compute_client, ctx.config)
|
||||
convoy.fleet.action_fs_cluster_add(
|
||||
ctx.resource_client, ctx.compute_client, ctx.network_client,
|
||||
ctx.blob_client, ctx.config, storage_cluster_id)
|
||||
|
||||
|
||||
@cluster.command('resize')
|
||||
@common_options
|
||||
@fs_cluster_options
|
||||
|
@ -2807,6 +2892,180 @@ def fed_jobs_zap(
|
|||
ctx.blob_client, ctx.config, federation_id, unique_id)
|
||||
|
||||
|
||||
@cli.group()
|
||||
@pass_cli_context
|
||||
def slurm(ctx):
|
||||
"""Slurm on Batch actions"""
|
||||
pass
|
||||
|
||||
|
||||
@slurm.group()
|
||||
@pass_cli_context
|
||||
def ssh(ctx):
|
||||
"""Slurm SSH actions"""
|
||||
pass
|
||||
|
||||
|
||||
@ssh.command('controller')
|
||||
@click.option(
|
||||
'--offset', help='Controller VM offset')
|
||||
@click.option(
|
||||
'--tty', is_flag=True, help='Allocate a pseudo-tty')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@click.argument('command', nargs=-1)
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_ssh_controller(ctx, offset, tty, command):
|
||||
"""Interactively login via SSH to a Slurm controller virtual
|
||||
machine in Azure"""
|
||||
ctx.initialize_for_slurm()
|
||||
convoy.fleet.action_slurm_ssh(
|
||||
ctx.compute_client, ctx.network_client, None, None, ctx.config,
|
||||
tty, command, 'controller', offset, None)
|
||||
|
||||
|
||||
@ssh.command('login')
|
||||
@click.option(
|
||||
'--offset', help='Login VM offset')
|
||||
@click.option(
|
||||
'--tty', is_flag=True, help='Allocate a pseudo-tty')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@click.argument('command', nargs=-1)
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_ssh_login(ctx, offset, tty, command):
|
||||
"""Interactively login via SSH to a Slurm login/gateway virtual
|
||||
machine in Azure"""
|
||||
ctx.initialize_for_slurm()
|
||||
convoy.fleet.action_slurm_ssh(
|
||||
ctx.compute_client, ctx.network_client, None, None, ctx.config,
|
||||
tty, command, 'login', offset, None)
|
||||
|
||||
|
||||
@ssh.command('node')
|
||||
@click.option(
|
||||
'--node-name', help='Slurm node name')
|
||||
@click.option(
|
||||
'--tty', is_flag=True, help='Allocate a pseudo-tty')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@click.argument('command', nargs=-1)
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_ssh_node(ctx, node_name, tty, command):
|
||||
"""Interactively login via SSH to a Slurm compute node virtual
|
||||
machine in Azure"""
|
||||
ctx.initialize_for_slurm(init_batch=True)
|
||||
if convoy.util.is_none_or_empty(node_name):
|
||||
raise ValueError('node name must be specified')
|
||||
convoy.fleet.action_slurm_ssh(
|
||||
ctx.compute_client, ctx.network_client, ctx.table_client,
|
||||
ctx.batch_client, ctx.config, tty, command, 'node', None, node_name)
|
||||
|
||||
|
||||
@slurm.group()
|
||||
@pass_cli_context
|
||||
def cluster(ctx):
|
||||
"""Slurm cluster actions"""
|
||||
pass
|
||||
|
||||
|
||||
@cluster.command('create')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_cluster_create(ctx):
|
||||
"""Create a Slurm cluster with controllers and login nodes"""
|
||||
ctx.initialize_for_slurm(init_batch=True)
|
||||
convoy.fleet.action_slurm_cluster_create(
|
||||
ctx.auth_client, ctx.resource_client, ctx.compute_client,
|
||||
ctx.network_client, ctx.blob_client, ctx.table_client,
|
||||
ctx.queue_client, ctx.batch_client, ctx.config)
|
||||
|
||||
|
||||
@cluster.command('orchestrate')
|
||||
@click.option(
|
||||
'--storage-cluster-id', help='Storage cluster id to create')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@batch_options
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_cluster_orchestrate(ctx, storage_cluster_id):
|
||||
"""Orchestrate a Slurm cluster with shared file system and Batch pool"""
|
||||
if convoy.util.is_not_empty(storage_cluster_id):
|
||||
ctx.cleanup = False
|
||||
ctx.initialize_for_fs()
|
||||
convoy.fleet.action_fs_disks_add(
|
||||
ctx.resource_client, ctx.compute_client, ctx.config)
|
||||
convoy.fleet.action_fs_cluster_add(
|
||||
ctx.resource_client, ctx.compute_client, ctx.network_client,
|
||||
ctx.blob_client, ctx.config, storage_cluster_id)
|
||||
ctx.cleanup = True
|
||||
else:
|
||||
logger.warning(
|
||||
'skipping fs cluster orchestration as no storage cluster id '
|
||||
'was specified')
|
||||
ctx.initialize_for_slurm(init_batch=True)
|
||||
convoy.fleet.action_pool_add(
|
||||
ctx.resource_client, ctx.compute_client, ctx.network_client,
|
||||
ctx.batch_mgmt_client, ctx.batch_client, ctx.blob_client,
|
||||
ctx.table_client, ctx.keyvault_client, ctx.config)
|
||||
convoy.fleet.action_slurm_cluster_create(
|
||||
ctx.auth_client, ctx.resource_client, ctx.compute_client,
|
||||
ctx.network_client, ctx.blob_client, ctx.table_client,
|
||||
ctx.queue_client, ctx.batch_client, ctx.config)
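# Hypothetical invocation (file names are placeholders; the options come from the
# existing common_options/batch_options/slurm_options decorators):
#   ./shipyard.py slurm cluster orchestrate --credentials credentials.yaml \
#       --config config.yaml --pool pool.yaml --slurm slurm.yaml \
#       --storage-cluster-id mystoragecluster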
|
||||
|
||||
|
||||
@cluster.command('status')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_cluster_status(ctx):
|
||||
"""Query status of a Slurm controllers and login nodes"""
|
||||
ctx.initialize_for_slurm()
|
||||
convoy.fleet.action_slurm_cluster_status(
|
||||
ctx.compute_client, ctx.network_client, ctx.config)
|
||||
|
||||
|
||||
@cluster.command('destroy')
|
||||
@click.option(
|
||||
'--delete-resource-group', is_flag=True,
|
||||
help='Delete all resources in the Slurm controller resource group')
|
||||
@click.option(
|
||||
'--delete-virtual-network', is_flag=True, help='Delete virtual network')
|
||||
@click.option(
|
||||
'--generate-from-prefix', is_flag=True,
|
||||
help='Generate resources to delete from Slurm controller hostname prefix')
|
||||
@click.option(
|
||||
'--no-wait', is_flag=True, help='Do not wait for deletion to complete')
|
||||
@common_options
|
||||
@slurm_options
|
||||
@keyvault_options
|
||||
@aad_options
|
||||
@pass_cli_context
|
||||
def slurm_cluster_destroy(
|
||||
ctx, delete_resource_group, delete_virtual_network,
|
||||
generate_from_prefix, no_wait):
|
||||
"""Destroy a Slurm controller"""
|
||||
ctx.initialize_for_slurm(init_batch=True)
|
||||
convoy.fleet.action_slurm_cluster_destroy(
|
||||
ctx.resource_client, ctx.compute_client, ctx.network_client,
|
||||
ctx.blob_client, ctx.table_client, ctx.queue_client, ctx.config,
|
||||
delete_resource_group, delete_virtual_network, generate_from_prefix,
|
||||
not no_wait)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
convoy.util.setup_logger(logger)
|
||||
cli()
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# Dockerfile for Azure/batch-shipyard (Slurm)
|
||||
|
||||
FROM alpine:3.9
|
||||
MAINTAINER Fred Park <https://github.com/Azure/batch-shipyard>
|
||||
|
||||
# copy in files
|
||||
COPY slurm.py requirements.txt /opt/batch-shipyard/
|
||||
|
||||
# add base packages and python dependencies
|
||||
RUN apk update \
|
||||
&& apk add --update --no-cache \
|
||||
musl build-base python3 python3-dev openssl-dev libffi-dev \
|
||||
ca-certificates cifs-utils bash \
|
||||
&& python3 -m pip install --no-cache-dir --upgrade pip \
|
||||
&& pip3 install --no-cache-dir --upgrade -r /opt/batch-shipyard/requirements.txt \
|
||||
&& apk del --purge \
|
||||
build-base python3-dev openssl-dev libffi-dev \
|
||||
&& rm /var/cache/apk/* \
|
||||
&& rm -f /opt/batch-shipyard/requirements.txt
|
||||
|
||||
# pre-compile files
|
||||
RUN python3 -m compileall -f /opt/batch-shipyard
|
||||
|
||||
# set entrypoint
|
||||
ENTRYPOINT ["python3", "/opt/batch-shipyard/slurm.py"]
|
|
@ -0,0 +1,8 @@
|
|||
azure-batch==6.0.0
|
||||
azure-cosmosdb-table==1.0.5
|
||||
azure-mgmt-storage==3.1.1
|
||||
azure-mgmt-resource==2.1.0
|
||||
azure-storage-queue==1.4.0
|
||||
msrestazure==0.5.1
|
||||
python-dateutil==2.7.5
|
||||
requests==2.21.0
|
|
@ -0,0 +1,119 @@
|
|||
#
|
||||
# See the slurm.conf man page for more information.
|
||||
#
|
||||
ClusterName={CLUSTER_NAME}
|
||||
SlurmctldHost={SLURMCTLD_HOST_PRIMARY}
|
||||
#{SLURMCTLD_HOST_SECONDARY}
|
||||
#{SLURMCTLD_HOST_TERTIARY}
|
||||
|
||||
SlurmUser=slurm
|
||||
SlurmctldPort=6817
|
||||
SlurmdPort=6818
|
||||
|
||||
AuthType=auth/munge
|
||||
|
||||
StateSaveLocation={SLURMCTLD_STATE_SAVE_PATH}
|
||||
|
||||
#SlurmdSpoolDir=/var/lib/slurm-llnl/slurmd
|
||||
#SlurmctldPidFile=/var/run/slurm-llnl/slurmctld.pid
|
||||
#SlurmdPidFile=/var/run/slurm-llnl/slurmd.pid
|
||||
#PluginDir=/usr/lib/x86_64-linux-gnu/slurm-wlm
|
||||
|
||||
SlurmdSpoolDir=/var/spool/slurm/slurmd
|
||||
SlurmctldPidFile=/var/run/slurmctld.pid
|
||||
SlurmdPidFile=/var/run/slurmd.pid
|
||||
PluginDir=/usr/lib/slurm
|
||||
|
||||
# LOGGING
|
||||
SlurmctldDebug=5
|
||||
# the slurmctld log file setting does not support the %h substitution, so log locally
|
||||
#SlurmctldLogFile=/var/log/slurm-llnl/slurmctld.log
|
||||
SlurmctldLogFile=/var/log/slurm/slurmctld.log
|
||||
SlurmdDebug=5
|
||||
SlurmdLogFile={SLURM_LOG_PATH}/slurmd-%h.log
|
||||
|
||||
# PROCESS TRACKING
|
||||
#ProctrackType=proctrack/pgid
|
||||
ProctrackType=proctrack/cgroup
|
||||
SwitchType=switch/none
|
||||
MpiDefault=none
|
||||
|
||||
#FirstJobId=
|
||||
ReturnToService=1
|
||||
#MaxJobCount=
|
||||
#PlugStackConfig=
|
||||
#PropagatePrioProcess=
|
||||
#PropagateResourceLimits=
|
||||
#PropagateResourceLimitsExcept=
|
||||
#Prolog=/etc/slurm/prolog.d/*
|
||||
#Epilog=/etc/slurm/epilog.d/*
|
||||
#SrunProlog=
|
||||
#SrunEpilog=
|
||||
#TaskProlog=
|
||||
#TaskEpilog=
|
||||
|
||||
TaskPlugin=task/affinity,task/cgroup
|
||||
#TrackWCKey=no
|
||||
#TmpFS=
|
||||
#UsePAM=
|
||||
|
||||
# TIMERS
|
||||
SlurmctldTimeout=300
|
||||
SlurmdTimeout=300
|
||||
InactiveLimit=0
|
||||
MinJobAge=300
|
||||
KillWait=30
|
||||
Waittime=0
|
||||
|
||||
# SCHEDULING
|
||||
SchedulerType=sched/backfill
|
||||
#SchedulerAuth=
|
||||
SelectType=select/linear
|
||||
#SelectType=select/cons_res
|
||||
#SelectTypeParameters=CR_Core_Memory,CR_CORE_DEFAULT_DIST_BLOCK,CR_ONE_TASK_PER_CORE
|
||||
FastSchedule=1
|
||||
#PriorityType=priority/multifactor
|
||||
#PriorityDecayHalfLife=14-0
|
||||
#PriorityUsageResetPeriod=14-0
|
||||
#PriorityWeightFairshare=100000
|
||||
#PriorityWeightAge=1000
|
||||
#PriorityWeightPartition=10000
|
||||
#PriorityWeightJobSize=1000
|
||||
#PriorityMaxAge=1-0
|
||||
|
||||
JobCompType=jobcomp/none
|
||||
#JobCompLoc=
|
||||
|
||||
# ACCOUNTING
|
||||
JobAcctGatherType=jobacct_gather/cgroup
|
||||
#JobAcctGatherFrequency=30
|
||||
|
||||
AccountingStorageTRES=gres/gpu
|
||||
DebugFlags=CPU_Bind,gres
|
||||
AccountingStorageType=accounting_storage/slurmdbd
|
||||
AccountingStorageHost={HOSTNAME}
|
||||
AccountingStoragePort=6819
|
||||
#AccountingStorageLoc=
|
||||
AccountingStoragePass=/var/run/munge/munge.socket.2
|
||||
AccountingStorageUser=slurm
|
||||
|
||||
# POWER SAVE
|
||||
SuspendProgram={SHIPYARD_VAR_DIR}/suspend.sh
|
||||
ResumeProgram={SHIPYARD_VAR_DIR}/resume.sh
|
||||
ResumeFailProgram={SHIPYARD_VAR_DIR}/resume-fail.sh
|
||||
SuspendTime={IDLE_RECLAIM_TIME_SEC}
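# e.g. an idle reclaim time of 15 minutes would be rendered above as SuspendTime=900
# ({IDLE_RECLAIM_TIME_SEC} is presumably substituted in seconds before deployment)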
|
||||
SuspendRate=0
|
||||
ResumeRate=0
|
||||
SuspendTimeout=1200
|
||||
ResumeTimeout=1200
|
||||
#{SUSPEND_EXC_NODES}
|
||||
|
||||
# TreeWidth must be at least as large as the maximum node count for cloud nodes
|
||||
TreeWidth={MAX_NODES}
|
||||
|
||||
# GENERIC RESOURCES
|
||||
#{GRES_TYPES}
|
||||
|
||||
# PARTITIONS AND NODES
|
||||
#{ADDITIONAL_NODES}
|
||||
#{ADDITIONAL_PARTITIONS}
|
(Diff for this file not shown because of its large size)
|
@ -0,0 +1,6 @@
|
|||
create database slurm_acct_db;
|
||||
create user 'slurm'@'localhost';
|
||||
set password for 'slurm'@'localhost' = password('{SLURM_DB_PASSWORD}');
|
||||
grant usage on *.* to 'slurm'@'localhost';
|
||||
grant all privileges on slurm_acct_db.* to 'slurm'@'localhost';
|
||||
flush privileges;
|
|
@ -0,0 +1,44 @@
|
|||
#
|
||||
# Example slurmdbd.conf file.
|
||||
#
|
||||
# See the slurmdbd.conf man page for more information.
|
||||
|
||||
# Archive info
|
||||
#ArchiveJobs=yes
|
||||
#ArchiveDir="/tmp"
|
||||
#ArchiveSteps=yes
|
||||
#ArchiveScript=
|
||||
#JobPurge=12
|
||||
#StepPurge=1
|
||||
|
||||
# Authentication info
|
||||
AuthType=auth/munge
|
||||
#AuthInfo=/var/run/munge/munge.socket.2
|
||||
|
||||
# slurmDBD info
|
||||
DbdAddr={HOSTNAME}
|
||||
DbdHost={HOSTNAME}
|
||||
DbdPort=6819
|
||||
SlurmUser=slurm
|
||||
#MessageTimeout=300
|
||||
#DefaultQOS=normal,standby
|
||||
#PrivateData=accounts,users,usage,jobs
|
||||
#TrackWCKey=yes
|
||||
|
||||
DebugLevel=4
|
||||
LogFile={SLURM_LOG_PATH}/slurmdbd-{HOSTNAME}.log
|
||||
|
||||
#PidFile=/var/run/slurm-llnl/slurmdbd.pid
|
||||
#PluginDir=/usr/lib/x86_64-linux-gnu/slurm-wlm
|
||||
|
||||
PidFile=/var/run/slurmdbd.pid
|
||||
PluginDir=/usr/lib/slurm
|
||||
|
||||
# Database info
|
||||
StorageType=accounting_storage/mysql
|
||||
#StorageHost=localhost
|
||||
#StoragePort=1234
|
||||
StoragePass={SLURM_DB_PASSWORD}
|
||||
StorageUser=slurm
|
||||
StorageLoc=slurm_acct_db
|
||||
|