This commit is contained in:
Brian Santacruz 2023-06-12 14:18:53 -07:00
Родитель 46f9d38eac
Коммит 60aa3d624e
2 изменённых файлов: 265 добавлений и 1 удалений

Просмотреть файл

@ -55,7 +55,7 @@ Duration: 50 minutes
In this exercise, you will learn how to deploy **Az-hop** utilizing a Linux machine.
### Task 1: Set up WSL2
#
To install **Az-hop** we require to have a Linux machine that could support the toolchain to use during the installation, it is preferred to use Ubuntu 20.04+. So, if you are running a Windows desktop you should use WSL2 with an Ubuntu 20.04 environment.
Run the following commands to install WSL2 (you will only need to do this once on your local device) in your Windows computer.

Просмотреть файл

@ -0,0 +1,264 @@
---
# yaml-language-server: $schema=config.schema.json
#
# name of the cluster
project_name: az-hop
# azure location name as returned by the command : az account list-locations -o table
location: westus2
# Name of the resource group to create all resources
resource_group: azhop-qinstall
# If using an existing resource group set to true. Default is false
# When using an existing resource group make sure the location match the one of the existing resource group
use_existing_rg: false
# If set to true, will disable telemetry for azhop. See https://azure.github.io/az-hop/deploy/telemetry.html.
#optout_telemetry: true
tags:
env: dev
project: azhop-quick-install
# Define an Azure Netapp Files (ANF) account, single pool and volume
# If not present, assume that there is an existing NFS share for the users home directory
anf:
create: true
# Size of the ANF pool and unique volume (min: 4TB, max: 100TB)
homefs_size_tb: 4
# Service level of the ANF volume, can be: Standard, Premium, Ultra
homefs_service_level: Standard
# dual protocol
dual_protocol: false # true to enable SMB support. false by default
# If alerting is enabled, this value will be used to determine when to trigger alerts
alert_threshold: 80 # alert when ANF volume reaches this threshold
# For small deployments you can use Azure Files instead of ANF for the home directory
azurefiles:
create: false
size_gb: 1024
# These mounts will be listed in the Files menu of the OnDemand portal and automatically mounted on all compute nodes and remote desktop nodes
mounts:
# mount settings for the user home directory
home: # This home name can't be changed
type: anf # anf or azurefiles, default to anf. One of the two should be defined in order to mount the home directory
mountpoint: /anfhome # /sharedhome for example
server: '{{anf_home_ip}}' # Specify an existing NFS server name or IP, when using the ANF built in use '{{anf_home_ip}}'
export: '{{anf_home_path}}' # Specify an existing NFS export directory, when using the ANF built in use '{{anf_home_path}}'
options: '{{anf_home_opts}}' # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev
# name of the admin account
admin_user: hpcadmin
# List of identities (object ids) to grant read access to az-hop key vault (optional)
# key_vault_readers:
# Network
network:
# Create Network and Application Security Rules, true by default, false when using an existing VNET if not specified
create_nsg: true
vnet:
name: hpcvnet # Optional - default to hpcvnet
#id: # If a vnet id is set then no network will be created and the provided vnet will be used
address_space: "10.0.0.0/23"
# When using an existing VNET, only the subnet names will be used and not the adress_prefixes
subnets: # all subnets are optionals
# name values can be used to rename the default to specific names, address_prefixes to change the IP ranges to be used
# All values below are the default values
frontend:
name: frontend
address_prefixes: "10.0.0.0/29"
create: true # create the subnet if true. default to true when not specified, default to false if using an existing VNET when not specified
admin:
name: admin
address_prefixes: "10.0.0.16/28"
create: true
netapp:
name: netapp
address_prefixes: "10.0.0.32/28"
create: true
ad:
name: ad
address_prefixes: "10.0.0.8/29"
create: true
compute:
name: compute
address_prefixes: "10.0.1.0/24"
create: true
# Specify the Application Security Groups mapping if already existing
# When working in a locked down network, uncomment and fill out this section
locked_down_network:
enforce: false
# grant_access_from: [a.b.c.d] # Array of CIDR to grant access from, see https://docs.microsoft.com/en-us/azure/storage/common/storage-network-security?tabs=azure-portal#grant-access-from-an-internet-ip-range
public_ip: true # Enable public IP creation for Jumpbox, OnDemand and create images. Default to true
# Base image configuration. Can be either an image reference or an image_id from the image registry or a custom managed image
linux_base_image: "OpenLogic:CentOS:7_9-gen2:latest" # publisher:offer:sku:version or image_id
# linux image plan if required, format is publisher:product:name
#linux_base_plan:
windows_base_image: "MicrosoftWindowsServer:WindowsServer:2019-Datacenter-smalldisk:latest" # publisher:offer:sku:version or image_id
# Jumpbox VM configuration, only needed when deploying thru a public IP and without a configured deployer VM
jumpbox:
vm_size: Standard_B2ms
# SSH port under which the jumpbox SSH server listens on the public IP. Default to 22
# Change this to, e.g., 2222, if security policies (like "zero trust") in your tenant automatically block access to port 22 from the internet
#ssh_port: 2222
# Active directory VM configuration
ad:
vm_size: Standard_B2ms
hybrid_benefit: false # Enable hybrid benefit for AD, default to false
high_availability: false # Build AD in High Availability mode (2 Domain Controlers) - default to false
# On demand VM configuration
ondemand:
vm_size: Standard_D4s_v5
#fqdn: azhop.foo.com # When provided it will be used for the certificate server name
generate_certificate: true # Generate an SSL certificate for the OnDemand portal. Default to true
# Grafana VM configuration
grafana:
vm_size: Standard_B2ms
# Guacamole VM configuration
guacamole:
vm_size: Standard_B2ms
# Scheduler VM configuration
scheduler:
vm_size: Standard_B2ms
# CycleCloud VM configuration
cyclecloud:
vm_size: Standard_B2ms
# version: 8.3.0-3062 # to specify a specific version, see https://packages.microsoft.com/yumrepos/cyclecloud/
# List of users to be created on this environment
users:
- { name: clusteradmin, uid: 10001, groups: [5001, 5002] }
- { name: hpcuser, uid: 10002 }
usergroups:
# These groups cant be changed
- name: Domain Users # All users will be added to this one by default
gid: 5000
- name: az-hop-admins
gid: 5001
description: "For users with azhop admin privileges"
- name: az-hop-localadmins
gid: 5002
description: "For users with sudo right or local admin right on nodes"
# scheduler to be installed and configured (openpbs, slurm)
queue_manager: openpbs
# Specific SLURM configuration
slurm:
# Enable SLURM accounting, this will create a SLURM accounting database in a managed MariaDB server instance
accounting_enabled: false
# SLURM version to install. Currently supported: only 20.11.9 and 22.05.3.
# Other versions can be installed by building from source (See build_rpms setting in the slurmserver role)
slurm_version: 20.11.9
# Name of the SLURM cluster for accounting (optional, default to 'slurm')
# WARNING: changing this value on a running cluster will cause slurmctld to fail to start. This is a
# safety check to prevent accounting errors. To override, remove /var/spool/slurmd/clustername
cluster_name: slurm_azhop
enroot:
enroot_version: 3.4.1
# Authentication configuration for accessing the az-hop portal
# Default is basic authentication. For oidc authentication you have to specify the following values
# The OIDCClient secret need to be stored as a secret named <oidc-client-id>-password in the keyvault used by az-hop
authentication:
httpd_auth: basic # oidc or basic
image_gallery:
create: true # Create the shared image gallery to store custom images
# Autoscale default settings for all queues, can be overriden on each queue depending on the VM type if needed
autoscale:
idle_timeout: 1800 # Idle time in seconds before shutting down VMs - default to 1800 like in CycleCloud
# List of queues (node arrays in Cycle) to be defined
# don't use queue names longer than 8 characters in order to leave space for node suffix, as hostnames are limited to 15 chars due to domain join and NETBIOS constraints.
queues:
- name: execute # name of the Cycle Cloud node array
# Azure VM Instance type
vm_size: Standard_F2s_v2
# maximum number of cores that can be instanciated
max_core_count: 1024
# Use the pre-built azhop image from the marketplace
image: azhpc:azhop-compute:centos-7_9:latest
# Set to true if AccelNet need to be enabled. false is the default value
EnableAcceleratedNetworking: false
# spot instance support. Default is false
spot: false
# Set to false to disable creation of placement groups (for SLURM only). Default is true
ColocateNodes: false
# Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout
idle_timeout: 300
# Set the max number of vm's in a VMSS; requires additional limit raise through support ticket for >100;
# 100 is default value; lower numbers will improve scaling for single node jobs or jobs with small number of nodes
MaxScaleSetSize: 100
- name: hc44rs
vm_size: Standard_HC44rs
max_core_count: 440
image: azhpc:azhop-compute:centos-7_9:latest
spot: true
EnableAcceleratedNetworking: true
- name: hb120v2
vm_size: Standard_HB120rs_v2
max_core_count: 1200
image: azhpc:azhop-compute:centos-7_9:latest
spot: true
EnableAcceleratedNetworking: true
- name: hb120v3
vm_size: Standard_HB120rs_v3
max_core_count: 1200
image: azhpc:azhop-compute:centos-7_9:latest
spot: true
EnableAcceleratedNetworking: true
# Queue dedicated to GPU remote viz nodes. This name is fixed and can't be changed
- name: viz3d
vm_size: Standard_NV12s_v3
max_core_count: 48
# Use the pre-built azhop image from the marketplace
image: azhpc:azhop-desktop:centos-7_9:latest
# Use this image ID when building your own custom images
#image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-desktop3d/latest
ColocateNodes: false
spot: false
EnableAcceleratedNetworking: true
max_hours: 12 # Maximum session duration
min_hours: 1 # Minimum session duration - 0 is infinite
# Queue dedicated to share GPU remote viz nodes. This name is fixed and can't be changed
- name: largeviz3d
vm_size: Standard_NV48s_v3
max_core_count: 96
image: azhpc:azhop-desktop:centos-7_9:latest
ColocateNodes: false
EnableAcceleratedNetworking: true
spot: false
max_hours: 12
min_hours: 1
# Queue dedicated to non GPU remote viz nodes. This name is fixed and can't be changed
- name: viz
vm_size: Standard_D8s_v5
max_core_count: 200
image: azhpc:azhop-desktop:centos-7_9:latest
ColocateNodes: false
spot: false
EnableAcceleratedNetworking: true
max_hours: 12
min_hours: 1
# Application settings
applications:
bc_codeserver:
enabled: true
bc_jupyter:
enabled: true
bc_amlsdk:
enabled: false
bc_rstudio:
enabled: false
bc_ansys_workbench:
enabled: false
bc_vmd:
enabled: false
bc_paraview:
enabled: true
bc_vizer:
enabled: false