az-hop/config.tpl.yml

600 строки
25 KiB
YAML

---
# yaml-language-server: $schema=config.schema.json
# Name of the cluster
project_name: az-hop
# Azure location name as returned by the command: az account list-locations -o table
location: westeurope
# Name of the resource group in which all resources are created
resource_group: azhop
# Set to true when using an existing resource group. Default is false
# When using an existing resource group, make sure the location matches the one of the existing resource group
use_existing_rg: false
# If set to true, will disable telemetry for azhop. See https://azure.github.io/az-hop/deploy/telemetry.html.
#optout_telemetry: true
# A log analytics workspace can be created for monitoring or alerting
# Alternatively, you can use an existing workspace.
# To use an existing workspace set create to false and specify the resource group, name and subscription the target workspace lives in
log_analytics:
  create: false
  # An existing log analytics workspace can be used instead. The resource group, name and subscription id of the workspace will need to be specified.
  # Grant the role "Log Analytics Contributor" on the target Log Analytics Workspace for the identity used to deploy az-hop
  #resource_group:
  #name:
  #subscription_id: # Optional, if not specified the current subscription will be used
# Enable the assignment of a managed identity to compute VMs managed by CycleCloud.
# You can create an identity and optionally specify a name for the identity to be created or use an existing identity.
compute_vm_identity:
  create: false
  # An existing user assigned identity can be used instead.
  # The name & resource group of the target identity will need to be specified if using an existing identity.
  # If you opt for creating an identity, you can specify a name for the identity to be created, but do not need to specify the resource group.
  #name:
  #resource_group:
# Option to install the monitoring agent on static infra VMs. Can be disabled if the agent is installed by policy.
monitoring:
  azure_monitor_agent: false # Install Azure Monitor Agent on static infra VMs
  # Optional settings to deploy Grafana and install Telegraf
  telegraf: true # Install telegraf on static infra VMs and dynamic compute nodes. Default: true
  grafana: true # Deploy a Grafana instance with pre-defined dashboards. Default: true
  # Thresholds used by the grafana cluster stats dashboard (kept as quoted strings)
  idle_threshold: '70' # default threshold to highlight idle VMs in grafana cluster stats dashboard
  mem_threshold: '30' # default threshold to highlight VMs running out of memory in grafana cluster stats dashboard
  iowait_threshold: '40' # default threshold to highlight VMs waiting on IO in grafana cluster stats dashboard
# If enabled is set to true, alert rules associated with az-hop will be created.
# Enabling alerting requires an admin email to send alerts to.
alerting:
  enabled: false
  admin_email: admin.mail@contoso.com
  local_volume_threshold: 80
# Additional tags to be added on the Resource Group
tags:
  env: dev
  project: azhop
# Define an Azure Netapp Files (ANF) account, single pool and volume
# If not present, assume that there is an existing NFS share for the users home directory
anf:
  create: true
  # Size of the ANF pool and unique volume (min: 4TB, max: 100TB)
  homefs_size_tb: 4
  # Service level of the ANF volume, can be: Standard, Premium, Ultra
  homefs_service_level: Standard
  # dual protocol
  dual_protocol: false # true to enable SMB support. false by default
  # If alerting is enabled, this value will be used to determine when to trigger alerts
  alert_threshold: 80 # alert when ANF volume reaches this threshold
# For small deployments you can use Azure Files instead of ANF for the home directory
azurefiles:
  create: false
  size_gb: 1024
# These mounts will be listed in the Files menu of the OnDemand portal and automatically mounted on all compute nodes and remote desktop nodes
mounts:
  # mount settings for the user home directory
  home: # This home name can't be changed
    # type of mount: existing, anf or azurefiles, default to existing. One of the three should be defined in order to mount the home directory
    # When using existing, the mountpoint, server, export and options should be defined, for other cases leave the values as defined with the curly braces
    type: anf
    mountpoint: /anfhome # /sharedhome for example
    server: '{{anf_home_ip}}' # Specify an existing NFS server name or IP, when using the ANF built in use '{{anf_home_ip}}'
    export: '{{anf_home_path}}' # Specify an existing NFS export directory, when using the ANF built in use '{{anf_home_path}}'
    options: '{{anf_home_opts}}' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev
  # mount1:
  #   mountpoint: /mount1
  #   server: a.b.c.d # Specify an existing NFS server name or IP
  #   export: myexport1 # Specify an existing NFS export name
  #   options: my_options # Specify the mount options.
# Name of the admin account
admin_user: hpcadmin
# Optional: list of identities (object ids) to grant read access to the az-hop key vault
# key_vault_readers:
# Network
network:
  # Create Network and Application Security Rules, true by default, false when using an existing VNET if not specified
  create_nsg: true
  vnet:
    name: hpcvnet # Optional - default to hpcvnet
    #id: # If a vnet id is set then no network will be created and the provided vnet will be used
    address_space: "10.0.0.0/23"
    # Special VNET Tags
    # tags:
    #   key1: value1
    # When using an existing VNET, only the subnet names will be used and not the address_prefixes
    subnets: # all subnets are optional
      # name values can be used to rename the default to specific names, address_prefixes to change the IP ranges to be used
      # All values below are the default values
      frontend:
        name: frontend
        address_prefixes: "10.0.0.0/29"
        create: true # create the subnet if true. default to true when not specified, default to false if using an existing VNET when not specified
      ad:
        name: ad
        address_prefixes: "10.0.0.8/29"
        create: true
      admin:
        name: admin
        address_prefixes: "10.0.0.16/28"
        create: true
      netapp:
        name: netapp
        address_prefixes: "10.0.0.32/28"
        create: true
      # the outbounddns subnet is optional and only needed when deploying an Azure Private DNS Resolver
      # outbounddns:
      #   name: outbounddns
      #   address_prefixes: "10.0.0.48/28"
      #   create: true
      # Bastion and Gateway subnets are optional and can be added if a Bastion or a VPN need to be created in the environment
      # bastion: # Bastion subnet name is always fixed to AzureBastionSubnet
      #   address_prefixes: "10.0.0.64/26" # CIDR minimal range must be /26
      #   create: true
      # lustre: # Lustre subnet when deploying a lustre filesystem
      #   address_prefixes: "10.0.0.128/26"
      #   create: true
      # gateway: # Gateway subnet name is always fixed to GatewaySubnet
      #   address_prefixes: "10.0.0.192/27" # Recommendation is to use /27 or /26 network
      #   create: true
      # database: # database subnet, has to be enabled when using a managed mysql instance for slurm accounting
      #   address_prefixes: "10.0.0.224/28"
      #   create: true
      compute:
        name: compute
        address_prefixes: "10.0.1.0/24"
        create: true
  # Specify the Application Security Groups mapping if already existing
  # asg:
  #   resource_group: # name of the resource group containing the ASG. Default to the resource group containing azhop resources
  #   names: # list of ASG names mapping to the one defined in az-hop
  #     asg-ssh: asg-ssh
  #     asg-rdp: asg-rdp
  #     asg-jumpbox: asg-jumpbox
  #     asg-ad: asg-ad
  #     asg-ad-client: asg-ad-client
  #     asg-lustre-client: asg-lustre-client
  #     asg-pbs: asg-sched
  #     asg-cyclecloud: asg-cyclecloud
  #     asg-cyclecloud-client: asg-cyclecloud-client
  #     asg-nfs-client: asg-nfs-client
  #     asg-telegraf: asg-telegraf
  #     asg-grafana: asg-grafana
  #     asg-ondemand: asg-ondemand
  #     asg-deployer: asg-deployer
  #     asg-mysql-client: asg-mysql-client
  # peering: # This list is optional, and can be used to create VNet Peerings in the same subscription.
  #   - vnet_name: #"VNET Name to Peer to"
  #     vnet_resource_group: #"Resource Group of the VNET to peer to"
  #     vnet_allow_gateway: false # optional: allow gateway transit (default: true)
# Specify DNS forwarders available in the network
# dns:
#   forwarders:
#     - { name: foo.com, ips: "10.2.0.4, 10.2.0.5" }
# When working in a locked down network, uncomment and fill out this section
locked_down_network:
  enforce: false
  # grant_access_from: [a.b.c.d] # Array of CIDR to grant access from, see https://docs.microsoft.com/en-us/azure/storage/common/storage-network-security?tabs=azure-portal#grant-access-from-an-internet-ip-range
  public_ip: true # Enable public IP creation for Jumpbox, OnDemand and create images. Default to true
# NAT Gateway is used for outbound internet connectivity
nat_gateway:
  create: true
  #name: foo # name to be used instead of the default one created for you
# Base image configuration. Can be either an image reference or an image_id from the image registry or a custom managed image
# For AlmaLinux 8.7 use almalinux:almalinux-x86_64:8_7-gen2:latest
# For CentOS 7.9 use OpenLogic:CentOS:7_9-gen2:latest
linux_base_image: "OpenLogic:CentOS:7_9-gen2:latest" # publisher:offer:sku:version or image_id
# Linux image plan, if required; format is publisher:product:name
#linux_base_plan:
# Windows base image, expressed in the same format as linux_base_image
windows_base_image: "MicrosoftWindowsServer:WindowsServer:2019-Datacenter-smalldisk:latest" # publisher:offer:sku:version or image_id
# Private DNS zone settings for the environment
private_dns:
  create: false # Create a private DNS zone for the environment. Default to false
  name: hpc.azure # Name of the private DNS zone to be created. Default to hpc.azure
  registration_enabled: false # Enable auto-registration of VMs in the private DNS zone. Default to false
# Active Directory domain settings
domain:
  name: "hpc.azure"
  #domain_join_ou: "OU=azhop" # OU to set the machine in. Make sure the OU exists in the domain as it won't be created for you
  use_existing_dc: false # Set to true if you want to join a domain with existing DC
  # Account used to join the domain; its password is read from the key vault secret referenced below
  domain_join_user:
    username: hpcadmin
    password_key_vault_name: name_for_the_key_vault_with_the_domain_join_password
    password_key_vault_resource_group_name: resource_group_name_for_the_key_vault_with_the_domain_join_password
    password_key_vault_secret_name: key_vault_secret_name_for_the_domain_join_password
  # additional settings when using an existing DC
  existing_dc_details:
    domain_controller_names: ["dc1", "dc2"]
    domain_controller_ip_addresses: ["192.168.1.100", "192.168.1.101"]
    private_dns_servers: ["192.168.1.53", "192.168.2.53"]
# Optional: name of the key vault resource to be created. If not provided, a name will be generated
# azure_key_vault:
#   name: custom_key_vault_name
# Optional: name of the storage account to be created.
# azure_storage_account:
#   name: custom_storage_account_name
# Jumpbox VM configuration, only needed when deploying through a public IP and without a configured deployer VM
jumpbox:
  name: jumpbox
  vm_size: Standard_B2ms
  # SSH port under which the jumpbox SSH server listens on the public IP. Default to 22
  # Change this to, e.g., 2222, if security policies (like "zero trust") in your tenant automatically block access to port 22 from the internet
  #ssh_port: 2222
# Active directory VM configuration
ad:
  name: ad
  vm_size: Standard_B2ms
  hybrid_benefit: false # Enable hybrid benefit for AD, default to false
  high_availability: false # Build AD in High Availability mode (2 Domain Controllers) - default to false
  ha_name: ad2 # name of the HA AD machine when high_availability=true
# On demand VM configuration
ondemand:
  name: ondemand
  vm_size: Standard_D4s_v5
  #fqdn: azhop.foo.com # When provided it will be used for the certificate server name
  generate_certificate: true # Generate an SSL certificate for the OnDemand portal. Default to true
  #whitelist_path: "{{mounts.home.mountpoint}}:/tmp" # Optional: colon-separated list of paths allowed in the file browser
  #file_upload_max: 2048 # Maximum file upload size in bytes. Optional, default: 10GB
  #file_upload_allowed_extensions: [".txt", ".log"] # List of allowed file extensions. Default: all files are allowed
# Grafana VM configuration
grafana:
  name: grafana
  vm_size: Standard_B2ms
# Scheduler VM configuration
scheduler:
  name: scheduler
  vm_size: Standard_B2ms
# CycleCloud VM configuration
cyclecloud:
  name: ccportal
  vm_size: Standard_B2ms
  # version: # to specify a specific version, see https://packages.microsoft.com/yumrepos/cyclecloud/
  # Optional: use Ubuntu for the CycleCloud VM (default: linux_base_image)
  # image: "canonical:0001-com-ubuntu-server-focal:20_04-lts-gen2:latest"
  # plan: publisher:product:name
  # Change this to, e.g., 2222, if security policies (like "zero trust") in your tenant automatically block access to port 22 from the internet
  # ssh_port: 2222
  # use_as_deployer: true # default to false - to be used when deploying without a deployer VM
  # web_server_path: /cyclecloud # Override the CycleCloud Web Server Context Path - default to /cyclecloud
# Lustre cluster is optional and can be used to create a Lustre cluster in the environment.
lustre:
  create: false # true or false to create a lustre cluster
  # Options: AMLFS-Durable-Premium-40, AMLFS-Durable-Premium-125, AMLFS-Durable-Premium-250, AMLFS-Durable-Premium-500
  # See documentation for more details: https://learn.microsoft.com/en-us/azure/azure-managed-lustre/create-file-system-resource-manager
  sku: AMLFS-Durable-Premium-250
  # The step sizes are dependent on the SKU.
  # - AMLFS-Durable-Premium-40: 48TB
  # - AMLFS-Durable-Premium-125: 16TB
  # - AMLFS-Durable-Premium-250: 8TB
  # - AMLFS-Durable-Premium-500: 4TB
  capacity: 8
  # Optional: use existing storage for the archive
  # If not included, the azhop storage account that is created will be used
  # hsm:
  #   storage_account: #existing_storage_account_name
  #   storage_container: #only_used_with_existing_storage_account
# Users to be created on this environment
users:
  # Fields available for each user entry:
  #   name: username - must be less than 20 characters
  #   uid: uniqueid
  #   shell: /bin/bash # default to /bin/bash
  #   home: /anfhome/<user_name> # default to /homedir_mountpoint/user_name
  #   groups: list of groups the user belongs to
  - name: clusteradmin
    uid: 10001
    groups: [5001, 5002]
  - name: hpcuser
    uid: 10002
  # - { name: user1, uid: 10003, groups: [6000] }
  # - { name: user2, uid: 10004, groups: [6001] }
# Groups to be created on this environment
usergroups:
  # These group names could be changed but not the gids as names will be mapped by gids
  - name: azhop-users # All users will be added to this group by default
    gid: 5000
  - name: azhop-admins
    gid: 5001
    description: "For users with azhop admin privileges"
  - name: azhop-localadmins
    gid: 5002
    description: "For users with sudo right or local admin right on nodes"
  # For custom groups use gid >= 6000
  # - name: project1 # For project1 users
  #   gid: 6000
  # - name: project2 # For project2 users
  #   gid: 6001
# Enable cvmfs-eessi - disabled by default
cvmfs_eessi:
  enabled: false
# Scheduler to be installed and configured; supported values: openpbs, slurm
queue_manager: openpbs
# Specific SLURM configuration
slurm:
  # Enable SLURM accounting, this will create a SLURM accounting database in a managed mysql server instance
  accounting_enabled: false
  # SLURM version to install. Currently supported: only [20.11.9 | 22.05.3] with cyclecloud_slurm_version==2.7.x and [22.05.9 | 23.02.5] with cyclecloud_slurm_version==3.0.x.
  slurm_version: 22.05.9
  # CycleCloud for SLURM project version as defined in https://github.com/Azure/cyclecloud-slurm/releases. Currently supported: 2.7.0-2.7.2, 3.0.4.
  cyclecloud_slurm_version: 3.0.4
  # Name of the SLURM cluster for accounting (optional, default to 'slurm')
  # WARNING: changing this value on a running cluster will cause slurmctld to fail to start. This is a
  # safety check to prevent accounting errors. To override, remove /var/spool/slurmd/clustername
  cluster_name: slurm_azhop
# Enroot version to use
# NOTE(review): assumed to be a top-level section rather than a child of slurm - confirm against the schema
enroot:
  enroot_version: 3.4.1
#database:
#  # Resource name of the mysql instance to be created
#  name: custom_mysql_name
#  # If using an existing Managed mysql instance for SLURM accounting, specify these values
#  # Admin user of the database for which the password will be retrieved from the azhop keyvault
#  user: sqladmin
#  # FQDN of the existing managed instance
#  fqdn:
#  # IP of the managed private endpoint if the FQDN is not registered in a private DNS
#  ip:
# Create a Bastion in the bastion subnet when defined
bastion:
  create: false
# Create a VPN Gateway in the gateway subnet when specified
vpn_gateway:
  create: false
# Authentication configuration for accessing the az-hop portal
# Default is basic authentication. For oidc authentication you have to specify the following values
# The OIDCClient secret needs to be stored as a secret named <oidc-client-id>-password in the keyvault used by az-hop
authentication:
  user_auth: ad # local or ad - default to ad. Local will create local users and users groups on all infrastructure VMs and dynamically on dynamic nodes
  httpd_auth: basic # oidc or basic
  # User mapping https://osc.github.io/ood-documentation/latest/reference/files/ood-portal-yml.html#ood-portal-generator-user-map-match
  # You can specify either a map_match or a user_map_cmd
  # Domain users are mapped to az-hop users with the same name and without the domain name
  # user_map_match: '^([^@]+)@mydomain.foo$'
  # If using a custom mapping script, update it from the ./playbooks/files directory before running the playbook
  # user_map_cmd: /opt/ood/ood_auth_map/bin/custom_mapping.sh
  # ood_auth_openidc:
  #   OIDCProviderMetadataURL: # for AAD use 'https://sts.windows.net/{{tenant_id}}/.well-known/openid-configuration'
  #   OIDCClientID: 'XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX'
  #   OIDCRemoteUserClaim: # for AAD use 'unique_name'
  #   OIDCScope: # for AAD use 'openid profile email groups'
  #   OIDCPassIDTokenAs: # for AAD use 'serialized'
  #   OIDCPassRefreshToken: # for AAD use 'On'
  #   OIDCPassClaimsAs: # for AAD use 'environment'
  #   OIDCStateMaxNumberOfCookies: # in case of too many connections error for the same user set this to [10, true]
# Shared image gallery used to store custom images
image_gallery:
  create: false # Create the shared image gallery to store custom images
# List of images to be defined
images:
  # - name: image_definition_name # Should match the packer configuration file name, one per packer file
  #   publisher: azhop
  #   offer: CentOS
  #   sku: 7_9-gen2
  #   hyper_v: V2 # V1 or V2 (V1 is the default)
  #   os_type: Linux # Linux or Windows
  #   version: 7.9 # Version of the image to create the image definition in SIG. Pattern is major.minor where minor is mandatory
  # Pre-defined images
  - name: azhop-compute-almalinux-8_7
    publisher: azhop
    offer: almalinux
    sku: 8_7-hpc-gen2
    hyper_v: V2
    os_type: Linux
    version: 8.7
  - name: azhop-centos79-v2-rdma-gpgpu
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build compute nodes images
  - name: azhop-centos79-v2-rdma-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build remote viz nodes images
  - name: azhop-centos79-desktop3d-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-centos79-desktop3d
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-compute-centos-7_9
    publisher: azhpc
    offer: azhop-compute
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-desktop-centos-7_9
    publisher: azhpc
    offer: azhop-desktop
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-desktop-ubuntu-20_04
    publisher: azhpc
    offer: azhop-desktop
    sku: ubuntu-2004
    hyper_v: V2
    os_type: Linux
    version: 20.04
  - name: azhop-win10
    publisher: azhop
    offer: Windows-10
    sku: 21h1-pron
    hyper_v: V1
    os_type: Windows
    version: 10.19043
  # Base image when building your own HPC image and not using the HPC marketplace images
  - name: base-centos79-v2-rdma
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-rdma-nogpu
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: base-ubuntu-20_04-v2-rdma-gpgpu
    publisher: azhop
    offer: 0001-com-ubuntu-server-focal
    sku: 20_04-lts-gen2-rdma-gpgpu
    hyper_v: V2
    os_type: Linux
    version: 20.04
  - name: azhop-ubuntu-20_04-v2-rdma-gpgpu-ci
    publisher: azhop
    offer: 0001-com-ubuntu-server-focal
    sku: 20_04-lts-gen2-rdma-gpgpu-ci
    hyper_v: V2
    os_type: Linux
    version: 20.04
  - name: azhop-ubuntu-20_04-desktop3d-ci
    publisher: azhop
    offer: 0001-com-ubuntu-server-focal
    sku: 20_04-lts-gen2-desktop3d-ci
    hyper_v: V2
    os_type: Linux
    version: 20.04
# Autoscale default settings for all queues, can be overridden on each queue depending on the VM type if needed
autoscale:
  idle_timeout: 1800 # Idle time in seconds before shutting down VMs - default to 1800 like in CycleCloud
# List of queues (node arrays in Cycle) to be defined
# Don't use queue names longer than 8 characters in order to leave space for node suffix, as hostnames are limited to 15 chars due to domain join and NETBIOS constraints.
queues:
  - name: htc # name of the Cycle Cloud node array
    # Azure VM Instance type
    vm_size: Standard_F2s_v2
    # maximum number of cores that can be instantiated
    max_core_count: 1024
    # maximum number of nodes that can be instantiated. The lower of the two will be used
    max_count: 128
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-compute:centos-7_9:latest
    # Use this image ID when building your own custom images
    #image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-v2-rdma-gpgpu/latest
    # Image plan specification (when needed for the image). Terms must be accepted prior to deployment
    # plan: publisher:product:name
    # Set to true if AccelNet needs to be enabled. false is the default value
    EnableAcceleratedNetworking: false
    # spot instance support. Default is false
    spot: false
    # Set to false to disable creation of placement groups (for SLURM only). Default is true
    ColocateNodes: false
    # Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout
    idle_timeout: 300
    # Set the max number of vm's in a VMSS; requires additional limit raise through support ticket for >100;
    # 100 is default value; lower numbers will improve scaling for single node jobs or jobs with small number of nodes
    MaxScaleSetSize: 100
    # Use ephemeral boot disk for VM, check here for supported family https://learn.microsoft.com/en-us/azure/virtual-machines/ephemeral-os-disks
    EphemeralOSDisk: false
    # Whether to enable healthchecks. Defaults to true
    healthchecks: false
  - name: hpc
    vm_size: Standard_HB120rs_v3
    max_count: 10
    image: azhpc:azhop-compute:centos-7_9:latest
    EnableAcceleratedNetworking: true
  - name: gpu
    vm_size: Standard_NC24ads_A100_v4
    max_count: 0
    image: azhpc:azhop-compute:centos-7_9:latest
    EnableAcceleratedNetworking: true
  # Queues for remote visualization
  - name: viz3d
    type: remoteviz # mandatory property to specify the queue is for remote viz nodes
    description: "With GPU - Small GPU node for single session" # optional description that will appear in the node selection choice of the ood portal
    vm_size: Standard_NV12s_v3
    max_count: 4
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-desktop:centos-7_9:latest
    # Use this image ID when building your own custom images
    #image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-desktop3d/latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
    max_hours: 12 # Maximum session duration
    min_hours: 1 # Minimum session duration - 0 is infinite
  - name: largeviz3d
    type: remoteviz
    description: "Large With GPU - Intended for shared sessions"
    shareable: true # Set to true to allow multiple users to connect to the same node - false by default
    vm_size: Standard_NV48s_v3
    max_count: 2
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
    max_hours: 12
    min_hours: 1
  - name: viz
    type: remoteviz
    description: "Without GPU - for single session"
    vm_size: Standard_D8s_v5
    max_count: 10
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
    max_hours: 12
    min_hours: 1
  # Login nodes
  - name: login
    type: login
    vm_size: Standard_F2s_v2
    initial_count: 1
    max_count: 1
    image: azhpc:azhop-compute:centos-7_9:latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
# Application settings: each entry toggles one application through its enabled flag
applications:
  bc_codeserver:
    enabled: true
  bc_jupyter:
    enabled: true
  bc_amlsdk:
    enabled: false
  bc_rstudio:
    enabled: false
  bc_ansys_workbench:
    enabled: false
  bc_vmd:
    enabled: false
  bc_paraview:
    enabled: false
  bc_vizer:
    enabled: false