Mirror of https://github.com/Azure/azurehpc.git
Merge pull request #298 from Azure/hackathon_june_2020
CycleCloud integration
Commit 2c31608b11

README.md (68 lines changed)

@@ -27,8 +27,10 @@ The JSON file is composed of the following:

* Variables dictionary
* Setup information
* Network dictionary
* Storage dictionary
* Resources dictionary
* Install list
* CycleCloud dictionary

> Note: for the full config structure file see [config.json](https://github.com/Azure/azurehpc/tree/master/config.json)

@@ -38,6 +40,25 @@ This allows variables to be created and used throughout the config file (see how

> When creating templates for others to use, the value should be `<NOT-SET>` so the `azhpc-*` commands will notify the user.

To allow better reuse of variables across several configuration files, the variables section can reference a file like this:

```json
{
    "variables": "@variables.json"
}
```

The referenced file should contain the variables like this:

```json
{
    "image": "OpenLogic:CentOS:7.7:latest",
    "hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
    "location": "westeurope",
    "resource_group": "my resource group",
    "vm_type": "Standard_HB60rs",
    "vnet_resource_group": "variables.resource_group"
}
```

### Setup information

The following properties are global:
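For illustration, these global properties are set at the top level of the configuration file. A minimal sketch, using the values found in the example configs of this repository:

```json
{
    "location": "westeurope",
    "resource_group": "my resource group",
    "install_from": "jumpbox",
    "admin_user": "hpcadmin"
}
```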

@@ -111,7 +132,9 @@ This dictionary describes the storage resources to be created. Today only Azure N

| Name               | Description                                                                          | Required | Default |
|--------------------|--------------------------------------------------------------------------------------|----------|---------|
| **type**           | Type of storage - has to be set to `anf`                                             | yes      |         |
| **type**           | Type of storage - can be `anf, storageaccount`                                       | yes      |         |
| **sku**            | Only for `type=storageaccount`; can be `Standard_LRS, Standard_GRS, Standard_RAGRS, Standard_ZRS, Premium_LRS, Premium_ZRS, Standard_GZRS, Standard_RAGZRS` | yes | |
| **containers**     | Only for `type=storageaccount`. Array of container names to create                   | no       |         |
| **subnet**         | Subnet name in which to inject ANF NICs                                               | yes      |         |
| **joindomain**     | Domain name to join                                                                    | no       |         |
| **ad_server**      | Domain server to connect to                                                            | no       |         |
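For example, a minimal sketch of a `storageaccount` entry (the account and container names here are placeholders), following the pattern used in the `cycle-install-server-managed-identity.json` building block added in this PR:

```json
"storage": {
    "mystorageaccount": {
        "type": "storageaccount",
        "sku": "Standard_LRS",
        "containers": [ "container1", "container2" ]
    }
}
```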

@@ -127,7 +150,7 @@ This dictionary describes the ANF pools to be created
|--------------------|--------------------------------------------------------------------------------------|----------|---------|
| **service_level**  | Service Level - can be `Ultra, Premium, or Standard`                                  | yes      |         |
| **size**           | Total pool size in TB. From 4 to 100                                                  | yes      |         |
| **volumes**        | Dictionary of [ANF volumes](anf-volumes-dictionary) in that pool                      | yes      |         |
| **volumes**        | Dictionary of [ANF volumes](#anf-volumes-dictionary) in that pool                     | yes      |         |
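As a sketch only (the pool name is a placeholder and the per-volume settings described in the ANF Volumes dictionary below are omitted for brevity), an ANF pool entry looks like:

```json
"pools": {
    "pool1": {
        "service_level": "Premium",
        "size": 4,
        "volumes": {}
    }
}
```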

##### ANF Volumes dictionary

@@ -187,12 +210,51 @@ This describes the steps to install after all the resources have been provisione
| **script**  | The name of the script to run                                                        | yes      |         |
| **tag**     | The tag to select which resources will run this step                                 | yes      |         |
| **sudo**    | Boolean flag for whether to run the script with sudo                                 | no       | False   |
| **deps**    | A list of dependent scripts to be copied onto the `install_from` VM as well          | no       |         |
| **deps**    | A list of dependent files to be copied onto the `install_from` VM as well            | no       |         |
| **args**    | A list containing the arguments for the script                                       | no       |         |
| **copy**    | A list of files to copy to each resource from the `install_from` VM; assumes the files have been downloaded in a previous step | no | |

> Note: the script to run should be the path relative to either `$azhpc_dir/scripts` or a local `scripts` directory for the project. The local directory takes precedence over `$azhpc_dir/scripts`.
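For illustration, here are two install steps taken from the `beegfs-cluster.json` building block added in this PR:

```json
"install": [
    {
        "script": "disable-selinux.sh",
        "tag": "beegfs",
        "sudo": true
    },
    {
        "script": "beegfsm.sh",
        "args": [ "/data/beegfs/mgmt" ],
        "tag": "beegfs-mgmt",
        "sudo": true
    }
]
```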

### CycleCloud dictionary

This describes the CycleCloud clusters configuration and projects to be uploaded to the current CycleCloud installation.

| Name            | Description                                                                            | Required | Default |
|-----------------|------------------------------------------------------------------------------------------|----------|---------|
| **clusters**    | Dictionary of [CycleCloud clusters](#cyclecloud-cluster-dictionary) parameters            | yes      |         |
| **projects**    | Dictionary of [CycleCloud projects](#cyclecloud-projects-dictionary) init scripts         | yes      |         |
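The overall shape of this dictionary, as used in the repository's `config.json` reference, is sketched below (cluster, project and spec names are placeholders):

```json
"cyclecloud": {
    "clusters": {
        "clustername": {
            "template": "templatename",
            "parameters": {}
        }
    },
    "projects": {
        "projectname:specname:i.j.k": []
    }
}
```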

#### CycleCloud cluster dictionary

This describes the template and parameters to be applied on a CycleCloud cluster.

| Name           | Description                                                                            | Required | Default |
|----------------|------------------------------------------------------------------------------------------|----------|---------|
| **template**   | The name of the template used to create the cluster.                                      | yes      |         |
| **parameters** | Dictionary of parameters defined in the template. The parameter list can be retrieved with the [cyclecloud export_parameters](https://docs.microsoft.com/en-us/azure/cyclecloud/cli?view=cyclecloud-7#cyclecloud-export_parameters) command | yes | |
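For example, the `pbscycle` cluster defined in `examples/cc_beegfs/pbscycle.json` in this PR uses the PBS Pro template with a subset of its parameters overridden:

```json
"clusters": {
    "pbscycle": {
        "template": "pbspro_template_1.3.7",
        "parameters": {
            "MasterMachineType": "Standard_D8s_v3",
            "ExecuteMachineType": "variables.vm_type",
            "ImageName": "variables.hpc_image",
            "Region": "variables.location",
            "SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute"
        }
    }
}
```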

For the `ClusterInitSpec` definition use the following format, and make sure to use the same spec name in the projects dictionary:

```json
"xxxClusterInitSpecs": {
    "projectname:specname:i.j.k": {
        "Order": 10000,
        "Name": "projectname:specname:i.j.k",
        "Spec": "specname",
        "Project": "projectname",
        "Version": "i.j.k",
        "Locker": "azure-storage"
    }
}
```

#### CycleCloud projects dictionary

This describes the CycleCloud project containing cluster-init wrapper scripts to be uploaded to the CycleCloud locker.
Each project name has to follow the CycleCloud naming convention `projectname:specname:i.j.k` and contains an array of scripts described below:

| Name       | Description                                      | Required | Default |
|------------|--------------------------------------------------|----------|---------|
| **script** | The name of the script to run                    | yes      |         |
| **deps**   | A list of dependent files                        | no       |         |
| **args**   | A list containing the arguments for the script   | no       |         |
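A projects entry then looks like this (taken from the `config.json` reference structure; project, spec, script and file names are placeholders):

```json
"projects": {
    "projectname:specname:i.j.k": [
        {
            "script": "script1.sh",
            "args": [ "arg1", "arg2" ],
            "deps": [ "dep1.data", "dep2.json" ]
        },
        {
            "script": "script2.sh",
            "args": [ "arg1", "arg2" ]
        }
    ]
}
```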

### Macros in the config file

@@ -72,7 +72,7 @@ hostlist=$(pwd)/hosts.$PBS_JOBID

sort -u $PBS_NODEFILE > $hostlist
# remove .internal.cloudapp.net from node names
sed -i 's/.internal.cloudapp.net//g' $hostlist
#sed -i 's/.internal.cloudapp.net//g' $hostlist

case $MODE in
ring) # one to neighbour
@@ -3,14 +3,11 @@ APP_NAME=ior
SHARED_APP=${SHARED_APP:-/apps}
MODULE_DIR=${SHARED_APP}/modulefiles
MODULE_NAME=${APP_NAME}
INSTALL_DIR=${SHARED_APP}/${APP_NAME}
PARALLEL_BUILD=8
IOR_VERSION=3.2.1

sudo yum install -y jq
INSTALL_DIR=${SHARED_APP}/${APP_NAME}-$IOR_VERSION

source /etc/profile.d/modules.sh # so we can load modules
# GCC 8 is no longer provided with the CentOS-HPC 7.7 image, it is now 9.2, but is this really needed?
module load gcc-9.2.0

AZHPC_VMSIZE=$(curl -s -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmSize')

@@ -36,9 +33,10 @@ function create_modulefile {
mkdir -p ${MODULE_DIR}
cat << EOF > ${MODULE_DIR}/${MODULE_NAME}
#%Module
prepend-path PATH ${INSTALL_DIR}/bin;
prepend-path LD_LIBRARY_PATH ${INSTALL_DIR}/lib;
prepend-path MAN_PATH ${INSTALL_DIR}/share/man;
prepend-path PATH ${INSTALL_DIR}/bin;
prepend-path LD_LIBRARY_PATH ${INSTALL_DIR}/lib;
prepend-path MAN_PATH ${INSTALL_DIR}/share/man;
setenv IOR_BIN ${INSTALL_DIR}/bin
EOF
}
@@ -1,6 +1,6 @@
#!/bin/bash
FILESYSTEM=${FILESYSTEM:-/data}
SHARED_APP=${SHARED_APP:-/apps}
FILESYSTEM=${1:-/data}
SHARED_APP=${2:-/apps}

source /etc/profile # so we can load modules

@@ -29,10 +29,10 @@ CORES=$(cat $PBS_NODEFILE | wc -l)
NODES=$(cat $PBS_NODEFILE | sort -u)

# Throughput test (N-N)
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE ${SHARED_APP}/ior/bin/ior -a POSIX -v -i 3 -m -d 1 -B -e -F -r -w -t 32m -b 4G -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE $IOR_BIN/ior -a POSIX -v -i 3 -m -d 1 -B -e -F -r -w -t 32m -b 4G -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
sleep 2
# Throughput test (N-1)
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE ${SHARED_APP}/ior/bin/ior -a POSIX -v -i 3 -m -d 1 -B -e -r -w -t 32m -b 4G -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE $IOR_BIN/ior -a POSIX -v -i 3 -m -d 1 -B -e -r -w -t 32m -b 4G -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
sleep 2
# IOPS test
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE ${SHARED_APP}/ior/bin/ior -a POSIX -v -i 3 -m -d 1 -B -e -F -r -w -t 4k -b 128M -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
mpirun -bind-to hwthread -np $CORES --hostfile $PBS_NODEFILE $IOR_BIN/ior -a POSIX -v -i 3 -m -d 1 -B -e -F -r -w -t 4k -b 128M -o ${FILESYSTEM}/test.$(date +"%Y-%m-%d_%H-%M-%S") -O summaryFormat=JSON
@@ -17,7 +17,7 @@ numactl_options=" numactl --cpunodebind 0"
hostlist=$(pwd)/hosts.$PBS_JOBID
sort -u $PBS_NODEFILE > $hostlist
# remove .internal.cloudapp.net from node names
sed -i 's/.internal.cloudapp.net//g' $hostlist
#sed -i 's/.internal.cloudapp.net//g' $hostlist
BENCH=osu_bw

case $MODE in
|
@ -0,0 +1,109 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"beegfs_storage_instances": 2,
|
||||
"beegfs_storage_vm_type": "Standard_D16s_v3",
|
||||
"beegfs_mgmt_vm_type": "Standard_D4s_v3",
|
||||
"beegfs_disk_type": "local_ssd",
|
||||
"beegfs_node_type": "both",
|
||||
"beegfs_pools": "false",
|
||||
"beegfs_pools_restart": "false"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"beegfsm": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.beegfs_mgmt_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": false,
|
||||
"image": "variables.image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"beegfs-pkgs",
|
||||
"beegfs-mgmt",
|
||||
"beegfs",
|
||||
"beegfs-client"
|
||||
]
|
||||
},
|
||||
"beegfssm": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.beegfs_storage_vm_type",
|
||||
"instances": "variables.beegfs_storage_instances",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"beegfs-pkgs",
|
||||
"beegfs-storage",
|
||||
"beegfs"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "beegfs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "beegfs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfspkgs.sh",
|
||||
"tag": "beegfs-pkgs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsm.sh",
|
||||
"args": [
|
||||
"/data/beegfs/mgmt"
|
||||
],
|
||||
"tag": "beegfs-mgmt",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfssd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"variables.beegfs_pools_restart",
|
||||
"$(<hostlists/tags/beegfs-mgmt)"
|
||||
],
|
||||
"tag": "beegfs-storage",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsmd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"$(<hostlists/tags/beegfs-mgmt)"
|
||||
],
|
||||
"tag": "beegfs-storage",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/beegfs-mgmt)"
|
||||
],
|
||||
"tag": "beegfs-client",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"jb_vm_type": "Standard_D8s_v3",
|
||||
"cc_password_secret_name": "CycleAdminPassword"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.jb_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
"jumpbox"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"tag": "jumpbox",
|
||||
"script": "cyclecli_install.sh",
|
||||
"args": [
|
||||
"fqdn.cycleserver",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.{{variables.cc_password_secret_name}}",
|
||||
"variables.resource_group",
|
||||
"sakey.{{variables.projectstore}}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"cc_password_secret_name": "CycleAdminPassword"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "cyclecli_install.sh",
|
||||
"args": [
|
||||
"fqdn.cycleserver",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.{{variables.cc_password_secret_name}}",
|
||||
"variables.resource_group",
|
||||
"sakey.{{variables.projectstore}}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "cycleserver",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"cc_vm_type": "Standard_D8s_v3",
|
||||
"cc_password_secret_name": "CycleAdminPassword"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"storage": {
|
||||
"#projectstore#": {
|
||||
"type": "storageaccount",
|
||||
"sku": "Standard_LRS"
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
"cycleserver": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.cc_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"managed_identity": {
|
||||
"role": "contributor",
|
||||
"scope": "resource_group"
|
||||
},
|
||||
"tags": [
|
||||
"cycle"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "cycle",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cc_install_managed_identity.sh",
|
||||
"tag": "cycle",
|
||||
"sudo": true,
|
||||
"args": [
|
||||
"cycleserver",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.{{variables.cc_password_secret_name}}",
|
||||
"variables.projectstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "cc_nsg.sh",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"cycleserver"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"cc_password_secret_name": "CycleAdminPassword"
|
||||
},
|
||||
"vnet": {
|
||||
},
|
||||
"resources": {},
|
||||
"install": [
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "create_keyvault.sh",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"variables.key_vault"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "create_password_in_keyvault.sh",
|
||||
"args": [
|
||||
"variables.key_vault",
|
||||
"variables.cc_password_secret_name"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"admin_user": "hpcadmin",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"jb_vm_type": "Standard_D8s_v3"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.jb_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"data_disks": [
|
||||
1024,
|
||||
1024
|
||||
],
|
||||
"storage_sku": "Premium_LRS",
|
||||
"tags": [
|
||||
"jumpbox"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "create_raid0.sh",
|
||||
"tag": "jumpbox",
|
||||
"args": [
|
||||
"/dev/md10",
|
||||
"/dev/sd[c-d]"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "make_filesystem.sh",
|
||||
"tag": "jumpbox",
|
||||
"args": [
|
||||
"/dev/md10",
|
||||
"xfs",
|
||||
"/share"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "install-nfsserver.sh",
|
||||
"tag": "jumpbox",
|
||||
"args": [
|
||||
"/share"
|
||||
],
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"admin_user": "hpcadmin",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"jb_vm_type": "Standard_D8s_v3"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.jb_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
"jumpbox"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@@ -0,0 +1,54 @@
# Building Blocks

This directory contains reusable configuration file building blocks which can be used with the `init-and-merge.sh` script to produce a single config file. The variables have been named to avoid collisions with other blocks. Most blocks assume that a VNET exists (hence the dependency on the `vnet.json` block) and that the installation is done from a jumpbox.

## Block list

| Name                                           | Description                                                                                            | Dependency on          |
|------------------------------------------------|----------------------------------------------------------------------------------------------------------|------------------------|
| **beegfs-cluster.json**                        | Create BeeGFS cluster                                                                                    | `jumpbox`, `vnet`      |
| **cycle-cli-jumpbox.json**                     | Install the CycleCloud CLI on the jumpbox                                                                | `cycle-prereqs-managed-identity`, `vnet` |
| **cycle-cli-local.json**                       | Install the CycleCloud CLI locally                                                                       | `cycle-prereqs-managed-identity`, `vnet` |
| **cycle-install-server-managed-identity.json** | Create a CycleCloud server in a managed identity context                                                 | `cycle-prereqs-managed-identity`, `jumpbox`, `vnet` |
| **cycle-prereqs-managed-identity.json**        | Create all pre-requisites for deploying CycleCloud with managed identity                                 |                        |
| **jumpbox.json**                               | Create a jumpbox in the admin subnet                                                                     | Existence of a VNET    |
| **jumpbox-nfs.json**                           | Create a jumpbox in the admin subnet acting as a 2TB NFS server                                          | Existence of a VNET    |
| **vnet.json**                                  | Create a vnet named `hpcvnet` 10.2.0.0/20 with subnets admin, compute, netapp, viz and storage          |                        |


## How to use building blocks?

The blocks have been designed to be merged together into a single configuration file. The `init-and-merge.sh` script does the merge and initialization; you just need to provide an ordered list of the blocks you want to use and a JSON file containing the variables you want to set.
The `init.sh` script below will create a config file to deploy a VNET, a jumpbox with NFS and a BeeGFS cluster.

```
#!/bin/bash
block_dir=$azhpc_dir/blocks
AZHPC_CONFIG=config.json
AZHPC_VARIABLES=variables.json

blocks="$block_dir/vnet.json $block_dir/jumpbox-nfs.json $block_dir/beegfs-cluster.json"

# Initialize config file
echo "{}" >$AZHPC_CONFIG
$azhpc_dir/init-and-merge.sh "$blocks" $AZHPC_CONFIG $AZHPC_VARIABLES
```

Before running that script, create a `variables.json` file which contains all the `<NOT-SET>` values of the blocks. In this case these are only `resource_group` and `location`.
```json
{
    "variables": {
        "resource_group": "my resource group",
        "location": "my location"
    }
}
```

Once done, run the init script, which produces a config.json that can be used to build the whole environment.

```
$ ./init.sh
$ azhpc-build
```

> NOTE: Have a look at the examples/cc_beegfs directory for a detailed example.
|
@ -0,0 +1,27 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"admin_user": "hpcadmin"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"netapp": "10.2.2.0/24",
|
||||
"storage": "10.2.3.0/24",
|
||||
"viz": "10.2.4.0/24",
|
||||
"compute": "10.2.8.0/22"
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
changelog.md (34 lines changed)

@@ -1,4 +1,32 @@
# azhpc Change log
# AzureHPC Change log

- [PR 298](https://github.com/Azure/azurehpc/pull/298) : CycleCloud integration
    - added Cycle 7.9.x project support
    - added storage account support
    - re-evaluate variables after resource creation and before executing install scripts
    - install CycleServer 7.9.x with Managed Identity
    - added building blocks
    - added init-and-merge.sh script to initialize and merge building blocks into a single configuration file
    - allow the whole variables section to be referenced in an external file
    - full end-to-end BeeGFS + Cycle 7.9.x + PBS example in examples/cc_beegfs
    - Lustre cluster deployed using an Availability Set in examples/lustre_avset
- [PR 292](https://github.com/Azure/azurehpc/pull/277) : Fix to allow minor version choice for Lustre
- [PR 277](https://github.com/Azure/azurehpc/pull/277) : Single node health check script example
- [PR 266](https://github.com/Azure/azurehpc/pull/266) : Add --no-vnet option to not deploy VNETs when chaining deployments
- [PR 263](https://github.com/Azure/azurehpc/pull/263) : Support symlinks in azhpc-init
- [PR 255](https://github.com/Azure/azurehpc/pull/255) : Added SLURM
- [PR 249](https://github.com/Azure/azurehpc/pull/249) : Added GlusterFS
- [PR 247](https://github.com/Azure/azurehpc/pull/247) : Shearwater Reveal E2E deployment example
- [PR 245](https://github.com/Azure/azurehpc/pull/245) : Removed bash version and simplified install
- [PR 241](https://github.com/Azure/azurehpc/pull/241) : Better support for Windows with default RDP on public IP and azhpc-connect
- [PR 231](https://github.com/Azure/azurehpc/pull/231) : WRF update (to 4.1.5, previous version 4.1.3)
- [PR 230](https://github.com/Azure/azurehpc/pull/230) : Spack update (v0.14.2)
- [PR 215](https://github.com/Azure/azurehpc/pull/215) : Support custom image
- [PR 214](https://github.com/Azure/azurehpc/pull/214) : Changed OS and data disk defaults to Premium SSD
- [PR 212](https://github.com/Azure/azurehpc/pull/212) : Add Managed Identity support for VMs
- [PR 209](https://github.com/Azure/azurehpc/pull/209) : Change OS disk size default
- [PR 206](https://github.com/Azure/azurehpc/pull/206) : Added availability set support
- [PR 203](https://github.com/Azure/azurehpc/pull/203) : Added support for zones
- [PR 202](https://github.com/Azure/azurehpc/pull/202) : Updated to add color
- [PR 194](https://github.com/Azure/azurehpc/pull/194) : Added Grafana and Telegraf

* 01-Apr-2020 : Added Grafana and Telegraf
* 04-Feb-2020 : Added permission option to sasurl variable value. Append ",rw" for read write access. Default is read.
|
|
@ -51,7 +51,7 @@ config_file=$(basename $AZHPC_CONFIG)
|
|||
# clean up project dir
|
||||
if [ -d $PROJECT_DIR ]; then
|
||||
ls -al $PROJECT_DIR
|
||||
rm -rf $PROJECT_DIR
|
||||
# rm -rf $PROJECT_DIR
|
||||
fi
|
||||
|
||||
echo "Calling azhpc-init"
|
||||
|
@ -64,6 +64,7 @@ echo "********************************************************************"
|
|||
echo "* BUILD RESOURCES *"
|
||||
echo "********************************************************************"
|
||||
echo "Calling azhpc-build"
|
||||
export PATH=$PATH:$HOME/bin # add that path for any CycleCloud calls
|
||||
azhpc-build -c $config_file $AZHPC_OPTION
|
||||
return_code=$?
|
||||
cat deploy*.json
|
||||
|
|
|
@ -62,3 +62,13 @@ jobs:
|
|||
echo "removing resource group $AZHPC_RESOURCEGROUP"
|
||||
az group delete -n $AZHPC_RESOURCEGROUP -y --no-wait
|
||||
fi
|
||||
conf_dir=$(dirname $AZHPC_CONFIG)
|
||||
if [ "$PROJECT_DIR" = "" ]; then
|
||||
PROJECT_DIR=${conf_dir##*/}
|
||||
fi
|
||||
# clean up project dir
|
||||
if [ -d $PROJECT_DIR ]; then
|
||||
ls -al $PROJECT_DIR
|
||||
rm -rf $PROJECT_DIR
|
||||
fi
|
||||
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
parameters:
|
||||
timeout: 60
|
||||
continueOnError: true
|
||||
|
||||
steps:
|
||||
- task : AzureCLI@2
|
||||
displayName: "Build Configuration File"
|
||||
timeoutInMinutes: ${{ parameters.timeout }}
|
||||
inputs:
|
||||
azureSubscription: 'target-subscription'
|
||||
scriptLocation: inlineScript
|
||||
scriptType: 'bash'
|
||||
inlineScript: |
|
||||
if [ "$SYSTEM_DEBUG" = "true" ]; then
|
||||
set -x
|
||||
AZHPC_OPTION="--debug"
|
||||
fi
|
||||
|
||||
echo "********************************************************************"
|
||||
echo "* INIT CONFIG VARIABLES *"
|
||||
echo "********************************************************************"
|
||||
# AZHPC_UUID is set when creating the RG unique name when starting the pipeline
|
||||
export AZHPC_VARIABLES_UUID=${AZHPC_UUID-azhpc}
|
||||
|
||||
azhpc_variables=$(printenv | grep AZHPC_VARIABLES)
|
||||
init_variables="-v resource_group=$AZHPC_RESOURCEGROUP"
|
||||
for item in $azhpc_variables; do
|
||||
key=$(echo $item | cut -d '=' -f1)
|
||||
value=$(echo $item | cut -d '=' -f2)
|
||||
variable=${key#AZHPC_VARIABLES_}
|
||||
variable=${variable,,}
|
||||
init_variables+=",$variable=$value"
|
||||
done
|
||||
|
||||
echo $init_variables
|
||||
. install.sh
|
||||
conf_dir=$(dirname $AZHPC_CONFIG)
|
||||
if [ "$PROJECT_DIR" = "" ]; then
|
||||
PROJECT_DIR=${conf_dir##*/}
|
||||
fi
|
||||
config_file=$(basename $AZHPC_CONFIG)
|
||||
|
||||
echo "Calling azhpc-init"
|
||||
azhpc-init $AZHPC_OPTION -c $BUILD_REPOSITORY_LOCALPATH/$conf_dir/$config_file -d $PROJECT_DIR $init_variables || exit 1
|
||||
pushd $PROJECT_DIR
|
||||
|
||||
cp $BUILD_REPOSITORY_LOCALPATH/$conf_dir/init.sh .
|
||||
|
||||
./init.sh
|
||||
|
||||
echo "Content of config.json"
|
||||
jq '.' config.json
|
||||
echo "Content of variables.json"
|
||||
jq '.' variables.json
|
||||
|
||||
if [ -e prereqs.json ]; then
|
||||
echo "Content of prereqs.json"
|
||||
jq '.' prereqs.json
|
||||
azhpc-build $AZHPC_OPTION --no-vnet -c prereqs.json
|
||||
fi
|
||||
|
||||
# Override AZHPC_CONFIG so that the build command executed by the job pipeline will use this config file
|
||||
export AZHPC_CONFIG=$conf_dir/config.json
|
||||
echo "##vso[task.setvariable variable=azhpc.config]$AZHPC_CONFIG"
|
||||
|
||||
failOnStandardError: false
|
||||
continueOnError: ${{ parameters.continueOnError }}
|
config.json (40 lines changed)
|
@ -38,7 +38,9 @@
|
|||
"ad_username": "User to use to join the domain",
|
||||
"joindomain": "Domain name to join to",
|
||||
"subnet": "subnet name in which to inject ANF NICs",
|
||||
"type": "anf",
|
||||
"type": "anf, azurestorage",
|
||||
"sku": "Standard_LRS, Standard_GRS, Standard_RAGRS, Standard_ZRS, Premium_LRS, Premium_ZRS, Standard_GZRS, Standard_RAGZRS",
|
||||
"containers": [ "container1", "container2"],
|
||||
"pools": {
|
||||
"pool1": {
|
||||
"service_level": "Ultra, Premium, Standard",
|
||||
|
@ -114,5 +116,39 @@
|
|||
"sudo": "true, (false)",
|
||||
"reboot": "true, (false)"
|
||||
}
|
||||
]
|
||||
],
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"clustername": {
|
||||
"template": "templatename",
|
||||
"parameters": {
|
||||
"parameter1": "value1",
|
||||
"parameter2": "value2",
|
||||
"xxxClusterInitSpecs": {
|
||||
"projectname:specname:i.j.k": {
|
||||
"Order": 10000,
|
||||
"Name": "projectname:specname:i.j.k",
|
||||
"Spec": "specname",
|
||||
"Project": "projectname",
|
||||
"Version": "i.j.k",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"projectname:specname:i.j.k": [
|
||||
{
|
||||
"script": "script1.sh",
|
||||
"args": [ "arg1", "arg2"],
|
||||
"deps": ["dep1.data", "dep2.json"]
|
||||
},
|
||||
{
|
||||
"script": "script2.sh",
|
||||
"args": [ "arg1", "arg2"]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,93 @@
# AzureHPC and CycleCloud integration FAQ

With this integration, AzureHPC can create CycleCloud clusters with a set of predefined values for the cluster parameters, as well as add cluster-init specs that reuse existing AzureHPC scripts (for example, the ones provided for the storage clients). The main principle of the integration is to leave the CycleCloud cluster template untouched and to leverage cluster-init to extend any existing specs defined in the template.

## How to retrieve the list of existing templates?

If you are not using a customized template for which you know the exact name, one way of retrieving the full names of the pre-set templates is to run the following command:
```
$ cyclecloud show_cluster -t
--------------------------------------
gridengine_template_1.0.0 : *template*
--------------------------------------
Resource group:
Cluster nodes:
master: Off -- --
Total nodes: 1
------------------------------------
htcondor_template_1.0.1 : *template*
------------------------------------
Resource group:
Cluster nodes:
master: Off -- --
Total nodes: 1
-------------------------------
lsf_template_3.2.2 : *template*
-------------------------------
Resource group:
Cluster nodes:
proxy: Off -- --
Total nodes: 1
----------------------------------
pbspro_template_1.3.7 : *template*
----------------------------------
Resource group:
Cluster nodes:
master: Off -- --
Total nodes: 1
---------------------------------
slurm_template_2.1.0 : *template*
---------------------------------
Resource group:
Cluster nodes:
master: Off -- --
Total nodes: 1
```

## How to know which parameters can be used in the config file?

Parameters are defined in templates, and if you haven't created the template yourself or don't have access to the template text file, you can export the parameters of an existing cluster with this command:

```
$ cyclecloud export_parameters pbscycle
{
  "MaxExecuteCoreCount" : 1000,
  "MasterMachineType" : "Standard_D8s_v3",
  "UsePublicNetwork" : false,
  "ReturnProxy" : false,
  "Credentials" : "azure",
  "Autoscale" : true,
  "SubnetId" : "myrg/hpcvnet/compute",
  "UseLowPrio" : false,
  "Region" : "westeurope",
  "NumberLoginNodes" : 0,
  "MasterClusterInitSpecs" : null,
  "ExecuteMachineType" : "Standard_HB60rs",
  "pbspro" : null,
  "ImageName" : "OpenLogic:CentOS-HPC:7.7:latest",
  "ExecuteNodesPublic" : false,
  "ExecuteClusterInitSpecs" : null
}
```
If you haven't assigned any ClusterInit specs, these values will be `null` as above. If a ClusterInit spec has been assigned, its content would be:

```json
"MasterClusterInitSpecs" : {
  "azurehpc:default:1.0.0" : {
    "Order" : 10000,
    "Spec" : "default",
    "Name" : "azurehpc:default:1.0.0",
    "Project" : "azurehpc",
    "Locker" : "azure-storage",
    "Version" : "1.0.0"
  }
}
```

This full JSON content will have to be included in the AzureHPC configuration file.
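For example, in this PR's `examples/cc_beegfs/pbscycle.json` the exported spec content is placed inside the cluster's `parameters` dictionary:

```json
"parameters": {
    "MasterClusterInitSpecs": {
        "azurehpc:default:1.0.0": {
            "Order": 10000,
            "Name": "azurehpc:default:1.0.0",
            "Spec": "default",
            "Project": "azurehpc",
            "Version": "1.0.0",
            "Locker": "azure-storage"
        }
    }
}
```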

## What is the CycleCloud project content generated by AzureHPC?

The goal of the integration is to make it easy to use the scripts provided by AzureHPC. The projects dictionary in the configuration file lists which scripts to call, with which arguments, and any additional file dependencies. The `azhpc build` command uses the CycleCloud CLI to initialize a project and specs under the `azhpc_install_<configname>/projects` directory. All referenced files and dependencies are copied into `<projectname>/specs/<specname>/cluster-init/files` and wrapper scripts are generated in `<projectname>/specs/<specname>/cluster-init/scripts`. The project is then uploaded to the `azure-storage` CycleCloud locker.

## How to debug the cluster-init scripts?

Connect to the machine which is supposed to run the cluster-init scripts. The execution output is stored under `/opt/cycle/jetpack/logs/cluster-init/<projectname>`; in addition, look at `/opt/cycle/jetpack/logs/jetpack.log` and `/opt/cycle/jetpack/logs/chef-client.log`.
@@ -27,7 +27,7 @@
    }
  },
  "storage": {
    "hpcnetappfull": {
    "hpccatnetappfull": {
      "type": "anf",
      "subnet": "storage",
      "pools": {
|
@ -0,0 +1,129 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"storage_instances": 2,
|
||||
"storage_vm_type": "Standard_D16s_v3",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"beegfs_disk_type": "local_ssd",
|
||||
"beegfs_node_type": "both",
|
||||
"beegfs_pools": "false",
|
||||
"beegfs_pools_restart": "false"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"storage": "10.2.3.0/24",
|
||||
"compute": "10.2.4.0/22"
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D8s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"disable-selinux",
|
||||
"beegfspkgs",
|
||||
"beegfsc"
|
||||
]
|
||||
},
|
||||
"beegfsm": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D4s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": false,
|
||||
"image": "variables.image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"beegfspkgs",
|
||||
"beegfsm",
|
||||
"disable-selinux",
|
||||
"beegfsc"
|
||||
]
|
||||
},
|
||||
"beegfssm": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.storage_vm_type",
|
||||
"instances": "variables.storage_instances",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"beegfspkgs",
|
||||
"beegfssd",
|
||||
"beegfsmd",
|
||||
"cndefault",
|
||||
"disable-selinux"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfspkgs.sh",
|
||||
"tag": "beegfspkgs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsm.sh",
|
||||
"args": [
|
||||
"/data/beegfs/mgmt"
|
||||
],
|
||||
"tag": "beegfsm",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfssd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"variables.beegfs_pools_restart",
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfssd",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsmd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfsmd",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfsc",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
#!/bin/bash
|
||||
block_dir=$azhpc_dir/blocks
|
||||
AZHPC_CONFIG=config.json
|
||||
AZHPC_VARIABLES=variables.json
|
||||
|
||||
blocks="$block_dir/vnet.json $block_dir/jumpbox-nfs.json $block_dir/cycle-install-server-managed-identity.json $block_dir/cycle-cli-local.json $block_dir/cycle-cli-jumpbox.json $block_dir/beegfs-cluster.json $azhpc_dir/examples/cc_beegfs/pbscycle.json"
|
||||
|
||||
# Initialize config file
|
||||
echo "{}" >$AZHPC_CONFIG
|
||||
$azhpc_dir/init-and-merge.sh "$blocks" $AZHPC_CONFIG $AZHPC_VARIABLES
|
||||
|
||||
echo "{}" >prereqs.json
|
||||
prereqs="$block_dir/cycle-prereqs-managed-identity.json"
|
||||
$azhpc_dir/init-and-merge.sh $prereqs prereqs.json $AZHPC_VARIABLES
|
||||
|
||||
# Update locker name
|
||||
locker=$(azhpc-get -c $AZHPC_VARIABLES variables.projectstore | cut -d '=' -f2 | xargs)
|
||||
sed -i "s/#projectstore#/$locker/g" $AZHPC_CONFIG
|
|
@ -0,0 +1,2 @@
|
|||
steps:
|
||||
- template: ../../ci/templates/steps/build-config.yml
|
|
@ -0,0 +1,84 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:default:1.0.0": [
|
||||
{
|
||||
"script": "beegfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [ "beegfsm" ]
|
||||
},
|
||||
{
|
||||
"script": "auto_mount.sh",
|
||||
"args": [ "jumpbox:/share/apps", "/apps"]
|
||||
},
|
||||
{
|
||||
"script": "auto_mount.sh",
|
||||
"args": [ "jumpbox:/share/data", "/data"]
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"deps": ["azhpc4cycle.sh"]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
},
|
||||
"install": [
|
||||
]
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
pr: none
|
||||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- master
|
||||
- '*'
|
||||
paths:
|
||||
include:
|
||||
- examples/cc_beegfs/*
|
||||
exclude:
|
||||
- examples/cc_beegfs/readme.md
|
||||
|
||||
pool: Default
|
||||
|
||||
variables:
|
||||
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
||||
# configuration file to use
|
||||
azhpc.config: examples/cc_beegfs/variables.json
|
||||
# pipeline directory
|
||||
azhpc.pipeline_dir: examples/cc_beegfs
|
||||
# destination of scripts. Default is hpcuser@headnode:/apps
|
||||
azhpc.script_remote_dest: 'hpcadmin@jumpbox:.'
|
||||
|
||||
# Add the variables needed in your configuration file
|
||||
# Uncomment and set values below, or leave them commented and set them through pipeline variables
|
||||
# azhpc.variables.location: westeurope
|
||||
|
||||
|
||||
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
jobs:
|
||||
- template: ../../ci/templates/jobs/build.yml
|
||||
parameters:
|
||||
pre_steps: ../../../examples/cc_beegfs/init.yml
|
|
@ -0,0 +1,250 @@
|
|||
# Building the infrastructure
|
||||
Here we will explain how to deploy a full system with a VNET, JUMPBOX, CYCLESERVER and BEEGFS by using building blocks.
|
||||
|
||||
## Step 1 - install azhpc
|
||||
After cloning azhpc, source the install.sh script:
|
||||
|
||||
```
|
||||
$ git clone https://github.com/Azure/azurehpc.git
|
||||
$ cd azurehpc
|
||||
$ . install.sh
|
||||
$ cd ..
|
||||
```
|
||||
|
||||
## Step 2 - Initialize the configuration files
|
||||
Create a working directory from where you will do the deployment and configuration update. Don't work directly from the cloned repo.
|
||||
|
||||
```
|
||||
$ mkdir cc_beegfs
|
||||
$ cd cc_beegfs
|
||||
```
|
||||
|
||||
Then copy the init.sh and variables.json from examples/cc_beegfs to your working directory.
|
||||
|
||||
```
|
||||
$ cp $azhpc_dir/examples/cc_beegfs/init.sh .
|
||||
$ cp $azhpc_dir/examples/cc_beegfs/variables.json .
|
||||
```
|
||||
|
||||
Edit the variables.json to match your environment. Give a unique value to `uuid`. An existing keyvault should be referenced as it won't be created for you.
|
||||
|
||||
```json
|
||||
{
|
||||
"variables": {
|
||||
"resource_group": "my resource group",
|
||||
"location": "my location",
|
||||
"key_vault": "my key vault",
|
||||
"uuid": "unique value",
|
||||
"projectstore": "locker{{variables.uuid}}"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Run the init.sh script which will copy all the config files of the building blocks and initialize the variables by using the variables.json updated above.
|
||||
|
||||
```
|
||||
$ ./init.sh
|
||||
```
|
||||
|
||||
## Step 3 - Build the system
|
||||
|
||||
The first command will create the required pre-requisites for CycleCloud, like a Key Vault, and will generate a password and store it in the vault.
The second command will build all the resources and create a PBS cluster.
|
||||
|
||||
```
|
||||
$ azhpc-build --no-vnet -c prereqs.json
|
||||
$ azhpc-build
|
||||
```
|
||||
The build process should take about 13 minutes.
|
||||
|
||||
## Step 4 - Upload application scripts
|
||||
|
||||
Upload the AzureHPC application scripts onto the /apps share created on the Jumpbox. These scripts will be used from the master and compute nodes provisioned by CycleCloud.
|
||||
|
||||
```
|
||||
$ azhpc-scp -- -r $azhpc_dir/apps/. hpcadmin@jumpbox:/apps
|
||||
```
|
||||
|
||||
## Step 5 - Start the PBS cluster in CycleCloud
|
||||
|
||||
To start the PBS cluster attached to BeeGFS:
|
||||
|
||||
```
|
||||
$ cyclecloud start_cluster pbscycle
|
||||
```
|
||||
|
||||
Retrieve the cluster status by running:
|
||||
```
|
||||
$ cyclecloud show_cluster pbscycle | grep master | xargs | cut -d ' ' -f 2
|
||||
$ cyclecloud show_nodes -c pbscycle --format=json | jq -r '.[0].State'
|
||||
```
|
||||
|
||||
## Step 6 - Connect to the CycleServer UI
|
||||
|
||||
Retrieve the CycleServer DNS name from the Azure portal.
|
||||
|
||||
Retrieve the CycleCloud admin password from the logs
|
||||
|
||||
```
|
||||
$ grep password azhpc_install_config/install/*.log
|
||||
```
|
||||
|
||||
Connect to the CycleCloud web portal at `https://fqdn-of-cycleserver` as `hpcadmin` with the password retrieved above. Check that you have a `pbscycle` cluster.
Check that the pbscycle master has started, or wait until it has; allow about 12 minutes for the master to start.

Manually add a few nodes to the cluster.
|
||||
|
||||
# Running applications
|
||||
AzureHPC comes with a set of prebuilt application scripts which have been copied over to the `/apps` share. We will use these to run some examples.
|
||||
|
||||
## Step 1 - Connect to the master
|
||||
From the machine and directory where you deployed the infrastructure defined above, connect to the master.
|
||||
|
||||
```
|
||||
$ azhpc-connect jumpbox
|
||||
[2020-06-16 17:15:46] logging directly into jumpbox0e70ce.westeurope.cloudapp.azure.com
|
||||
Last login: Tue Jun 16 16:54:22 2020 from 137.116.212.169
|
||||
[hpcadmin@jumpbox ~]$ cyclecloud connect master -c pbscycle
|
||||
Connecting to hpcadmin@10.2.8.4 (pbscycle master) using SSH
|
||||
|
||||
__ __ | ___ __ | __ __|
|
||||
(___ (__| (___ |_, (__/_ (___ |_, (__) (__(_ (__|
|
||||
|
|
||||
|
||||
Cluster: pbscycle
|
||||
Version: 7.9.6
|
||||
Run List: recipe[cyclecloud], role[pbspro_master_role], recipe[cluster_init]
|
||||
[hpcadmin@ip-0A020804 ~]$
|
||||
```
|
||||
|
||||
## Step 2 - Check that the BeeGFS and NFS shares are mounted
|
||||
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~]$ df
|
||||
Filesystem 1K-blocks Used Available Use% Mounted on
|
||||
devtmpfs 16451984 0 16451984 0% /dev
|
||||
tmpfs 16463856 0 16463856 0% /dev/shm
|
||||
tmpfs 16463856 9264 16454592 1% /run
|
||||
tmpfs 16463856 0 16463856 0% /sys/fs/cgroup
|
||||
/dev/sda2 30416376 10572700 19843676 35% /
|
||||
/dev/sda1 505580 65552 440028 13% /boot
|
||||
/dev/sda15 506608 11328 495280 3% /boot/efi
|
||||
/dev/sdb1 65923564 53276 62498516 1% /mnt/resource
|
||||
tmpfs 3292772 0 3292772 0% /run/user/20002
|
||||
beegfs_nodev 263958528 651264 263307264 1% /beegfs
|
||||
jumpbox:/share/apps 2146156736 62080 2146094656 1% /apps
|
||||
jumpbox:/share/data 2146156736 62080 2146094656 1% /data
|
||||
[hpcadmin@ip-0A020804 ~]
|
||||
```
|
||||
|
||||
Check that the /apps directory contains all the AzureHPC application scripts
|
||||
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~]$ ls -al /apps
|
||||
total 20
|
||||
drwxrwxrwx. 31 root root 4096 Jun 18 07:59 .
|
||||
dr-xr-xr-x. 20 root root 4096 Jun 18 08:34 ..
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 79 Jun 18 07:59 abaqus
|
||||
drwxr-xr-x. 3 cyclecloud cyclecloud 92 Jun 18 07:59 ansys_mechanical
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 111 Jun 18 07:59 convergecfd
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 102 Jun 18 07:59 fio
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 74 Jun 18 07:59 fluent
|
||||
drwxr-xr-x. 8 cyclecloud cyclecloud 220 Jun 18 07:57 .git
|
||||
-rw-r--r--. 1 cyclecloud cyclecloud 5907 Jun 18 07:57 .gitignore
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 72 Jun 18 07:59 gromacs
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 268 Jun 18 07:59 health_checks
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 147 Jun 18 07:59 imb-mpi
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 134 Jun 18 07:59 intersect
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 85 Jun 18 07:59 io500
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 89 Jun 18 07:59 ior
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 78 Jun 18 07:59 lammps
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 102 Jun 18 07:59 linpack
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 82 Jun 18 07:59 namd
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 77 Jun 18 07:59 nwchem
|
||||
drwxr-xr-x. 4 cyclecloud cyclecloud 235 Jun 18 07:59 openfoam_org
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 72 Jun 18 07:59 openmpi
|
||||
drwxr-xr-x. 3 cyclecloud cyclecloud 108 Jun 18 07:59 opm
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 112 Jun 18 07:59 osu
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 92 Jun 18 07:59 pamcrash
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 57 Jun 18 07:59 paraview
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 131 Jun 18 07:59 prolb
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 123 Jun 18 07:59 radioss
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 44 Jun 18 07:59 resinsight
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 31 Jun 18 07:59 reveal
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 256 Jun 18 07:59 spack
|
||||
drwxr-xr-x. 3 cyclecloud cyclecloud 109 Jun 18 07:59 starccm
|
||||
drwxr-xr-x. 2 cyclecloud cyclecloud 4096 Jun 18 07:59 wrf
|
||||
[hpcadmin@ip-0A020804 ~]$
|
||||
```
|
||||
|
||||
> NOTE: You should set ownership of /apps to the hpcadmin user with: `sudo chown -R hpcadmin:hpcadmin /apps`
|
||||
|
||||
## Step 3 - Testing storage with IOR
|
||||
Build IOR with the AzureHPC application script. It has to be run with sudo as it installs additional packages on the master in order to build it.
|
||||
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~]$ qsub -N build_ior -k oe -j oe -l select=1 -- /apps/ior/build_ior.sh
|
||||
0.ip-0A020804
|
||||
[hpcadmin@ip-0A020804 ~]$ qstat
|
||||
Job id Name User Time Use S Queue
|
||||
---------------- ---------------- ---------------- -------- - -----
|
||||
0.ip-0A020804 build_ior hpcadmin 0 H workq
|
||||
```
|
||||
Check that a new node is provisioned (unless you have already started one manually). Allow 13 minutes for the node to be ready.
|
||||
Output file will be named `build_ior.o*`
|
||||
|
||||
After the build check that you have an `ior` module in `/apps/modulefiles` and IOR binaries in `/apps/ior-<version>`
|
||||
|
||||
Run IOR from a compute node by submitting a job
|
||||
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~]$ qsub -N ior -k oe -j oe -l select=1 -- /apps/ior/ior.sh /beegfs
|
||||
0.ip-0A020804
|
||||
[hpcadmin@ip-0A020804 ~]$ qstat
|
||||
Job id Name User Time Use S Queue
|
||||
---------------- ---------------- ---------------- -------- - -----
|
||||
1.ip-0A020804 ior hpcadmin 0 Q workq
|
||||
```
|
||||
Output file will be named `ior.o*`
|
||||
|
||||
|
||||
## Step 4 - Run latency and bandwidth tests
|
||||
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~]$ qsub -N pingpong -k oe -j oe -l select=2:ncpus=1:mpiprocs=1,place=scatter:excl -- /apps/imb-mpi/ringpingpong.sh ompi
|
||||
[hpcadmin@ip-0A020804 ~]$ qsub -N allreduce -k oe -j oe -l select=2:ncpus=60:mpiprocs=60,place=scatter:excl -- /apps/imb-mpi/allreduce.sh impi2018
|
||||
[hpcadmin@ip-0A020804 ~]$ qsub -N osu -k oe -j oe -l select=2:ncpus=1:mpiprocs=1,place=scatter:excl -- /apps/osu/osu_bw.sh
|
||||
```
|
||||
Output files will be named `pingpong.o*, allreduce.o*, osu.o*`
|
||||
|
||||
## Step 5 - Build and run HPL
|
||||
|
||||
Submit the build; once the job is finished, submit the run.
|
||||
```
|
||||
[hpcadmin@ip-0A020804 ~] qsub -N build_hpl -k oe -j oe -l select=1:ncpus=1:mpiprocs=1,place=scatter:excl -- /apps/linpack/build_hpl.sh
|
||||
[hpcadmin@ip-0A020804 ~] qsub -N single_hpl -k oe -j oe -l select=1:ncpus=1:mpiprocs=1,place=scatter:excl -- /apps/linpack/single_hpl.sh
|
||||
```
|
||||
|
||||
Output files will be named `build_hpl.o*, single_hpl.o*`
|
||||
|
||||
|
||||
# Remove all
|
||||
|
||||
## Step 1 - Optionally delete the PBS cluster
|
||||
|
||||
From your deployment machine run
|
||||
|
||||
```
|
||||
$ cyclecloud terminate_cluster pbscycle
|
||||
$ cyclecloud delete_cluster pbscycle
|
||||
```
|
||||
|
||||
## Step 2 - Drop all the resources
|
||||
|
||||
```
|
||||
$ azhpc-destroy --no-wait
|
||||
[2020-06-16 17:25:20] reading config file (config.json)
|
||||
[2020-06-16 17:25:20] warning: deleting entire resource group (xps-hack)
|
||||
[2020-06-16 17:25:20] you have 10s to change your mind and ctrl-c!
|
||||
[2020-06-16 17:25:30] too late!
|
||||
```
|
|
@ -0,0 +1,88 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"hpc_vm_type": "Standard_HB60rs",
|
||||
"htc_vm_type": "Standard_D16s_v3",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"slurmcycle": {
|
||||
"template": "slurm_template_2.1.0",
|
||||
"parameters": {
|
||||
"MaxHPCExecuteCoreCount": 240,
|
||||
"MaxHTCExecuteCoreCount": 128,
|
||||
"HTCUseLowPrio" : false,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"HTCMachineType": "variables.htc_vm_type",
|
||||
"HPCMachineType": "variables.hpc_vm_type",
|
||||
"MasterImageName": "variables.htc_image",
|
||||
"HTCImageName": "variables.htc_image",
|
||||
"HPCImageName": "variables.hpc_image",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "{{variables.location}}",
|
||||
"configuration_slurm_version" : "19.05.5-1",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HPCClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HTCClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:beegfs:1.0.0": [
|
||||
{
|
||||
"script": "beegfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [
|
||||
"beegfsm"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"variables": {
|
||||
"resource_group": "my resource group",
|
||||
"location": "my location",
|
||||
"key_vault": "my key vault",
|
||||
"uuid": "unique value",
|
||||
"projectstore": "locker{{variables.uuid}}"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "variables.admin_user",
|
||||
"vnet": {
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"storage": "10.2.3.0/24",
|
||||
"compute": "10.2.4.0/22"
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"spn_name": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"appid": "",
|
||||
"image": "OpenLogic:CentOS:7.7:latest"
|
||||
},
|
||||
"resources": {
|
||||
"cycleserver": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D8s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
]
|
||||
},
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D8s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
"jumpbox"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "jumpbox",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "cc_nsg.sh",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"cycleserver"
|
||||
]
|
||||
},
|
||||
{
|
||||
"script": "cyclecli_install.sh",
|
||||
"tag": "jumpbox",
|
||||
"args": [
|
||||
"fqdn.cycleserver",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.CycleAdminPassword",
|
||||
"variables.resource_group",
|
||||
"sakey.{{variables.projectstore}}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,249 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "headnode",
|
||||
"admin_user": "hpcadmin",
|
||||
"vnet": {
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"compute": "10.2.0.0/22",
|
||||
"storage": "10.2.4.0/24"
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"resource_group": "<NOT-SET>",
|
||||
"location": "westeurope",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"lustreimage": "OpenLogic:CentOS:7.7:latest",
|
||||
"hpcimage": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"compute_instances": 2,
|
||||
"lustre_instances": 4,
|
||||
"low_priority": false,
|
||||
"storage_account": "<NOT-SET>",
|
||||
"storage_key": "sakey.{{variables.storage_account}}",
|
||||
"storage_container": "<NOT-SET>",
|
||||
"log_analytics_lfs_name": "<NOT-SET>",
|
||||
"la_resourcegroup": "<NOT-SET>",
|
||||
"la_name": "<NOT-SET>",
|
||||
"log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}",
|
||||
"log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}",
|
||||
"lustre_version": "2.12.4",
|
||||
"lustre_mount": "/lustre",
|
||||
"lustre_sku": "Standard_L16s_v2",
|
||||
"lustre_avset": "{{variables.resource_group}}avset"
|
||||
},
|
||||
"resources": {
|
||||
"headnode": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D8s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "compute",
|
||||
"tags": [
|
||||
"disable-selinux",
|
||||
"cndefault",
|
||||
"lfsrepo",
|
||||
"lfsclient",
|
||||
"lfsazimport",
|
||||
"localuser",
|
||||
"pbsserver",
|
||||
"loginnode",
|
||||
"nfsserver"
|
||||
]
|
||||
},
|
||||
"compute": {
|
||||
"type": "vmss",
|
||||
"vm_type": "Standard_D32s_v3",
|
||||
"accelerated_networking": true,
|
||||
"instances": "variables.compute_instances",
|
||||
"low_priority": "variables.low_priority",
|
||||
"image": "variables.hpcimage",
|
||||
"subnet": "compute",
|
||||
"tags": [
|
||||
"disable-selinux",
|
||||
"cndefault",
|
||||
"lfsrepo",
|
||||
"lfsclient",
|
||||
"localuser",
|
||||
"pbsclient",
|
||||
"nfsclient"
|
||||
]
|
||||
},
|
||||
"lustremaster": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.lustre_sku",
|
||||
"availability_set": "variables.lustre_avset",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.lustreimage",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"lustremaster",
|
||||
"lustre",
|
||||
"lfsrepo",
|
||||
"disable-selinux",
|
||||
"lfsloganalytics"
|
||||
]
|
||||
},
|
||||
"lustre": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.lustre_sku",
|
||||
"instances": "variables.lustre_instances",
|
||||
"availability_set": "variables.lustre_avset",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.lustreimage",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"lustre",
|
||||
"lfsrepo",
|
||||
"disable-selinux",
|
||||
"lfsloganalytics"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "install-nfsserver.sh",
|
||||
"tag": "nfsserver",
|
||||
"args": [
|
||||
"/share"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "nfsclient.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/nfsserver)"
|
||||
],
|
||||
"tag": "nfsclient",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "localuser.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/nfsserver)"
|
||||
],
|
||||
"tag": "localuser",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "create_raid0.sh",
|
||||
"args": [
|
||||
"/dev/md10",
|
||||
"/dev/nvme*n1"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsrepo.sh",
|
||||
"tag": "lfsrepo",
|
||||
"args": [
|
||||
"variables.lustre_version"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfspkgs.sh",
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsmaster.sh",
|
||||
"tag": "lustremaster",
|
||||
"args": [
|
||||
"/dev/sdb"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsoss.sh",
|
||||
"args": [
|
||||
"lustremaster",
|
||||
"/dev/md10"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfshsm.sh",
|
||||
"args": [
|
||||
"lustremaster",
|
||||
"variables.storage_account",
|
||||
"variables.storage_key",
|
||||
"variables.storage_container",
|
||||
"variables.lustre_version"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsclient.sh",
|
||||
"args": [
|
||||
"lustremaster",
|
||||
"variables.lustre_mount"
|
||||
],
|
||||
"tag": "lfsclient",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsimport.sh",
|
||||
"args": [
|
||||
"variables.storage_account",
|
||||
"variables.storage_key",
|
||||
"variables.storage_container",
|
||||
"variables.lustre_mount",
|
||||
"variables.lustre_version"
|
||||
],
|
||||
"tag": "lfsazimport",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsloganalytics.sh",
|
||||
"args": [
|
||||
"variables.log_analytics_lfs_name",
|
||||
"variables.log_analytics_workspace",
|
||||
"variables.log_analytics_key"
|
||||
],
|
||||
"tag": "lfsloganalytics",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "pbsdownload.sh",
|
||||
"tag": "loginnode",
|
||||
"sudo": false
|
||||
},
|
||||
{
|
||||
"script": "pbsserver.sh",
|
||||
"copy": [
|
||||
"pbspro_19.1.1.centos7/pbspro-server-19.1.1-0.x86_64.rpm"
|
||||
],
|
||||
"tag": "pbsserver",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "pbsclient.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/pbsserver)"
|
||||
],
|
||||
"copy": [
|
||||
"pbspro_19.1.1.centos7/pbspro-execution-19.1.1-0.x86_64.rpm"
|
||||
],
|
||||
"tag": "pbsclient",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
pr: none
|
||||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- master
|
||||
- '*'
|
||||
paths:
|
||||
include:
|
||||
- examples/lustre_avset/*
|
||||
- scripts/lfsrepo.sh
|
||||
- scripts/lfspkgs.sh
|
||||
- scripts/lfsmaster.sh
|
||||
- scripts/lfsoss.sh
|
||||
- scripts/lfshsm.sh
|
||||
- scripts/lfsclient.sh
|
||||
- scripts/lfsimport.sh
|
||||
- scripts/lfsloganalytics.sh
|
||||
exclude:
|
||||
- examples/lustre_avset/readme.md
|
||||
|
||||
pool: Default
|
||||
|
||||
variables:
|
||||
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
||||
# configuration file to use
|
||||
azhpc.config: examples/lustre_avset/config.json
|
||||
# pipeline directory
|
||||
azhpc.pipeline_dir: examples/lustre_avset
|
||||
# destination of scripts. Default is hpcuser@headnode:/apps
|
||||
#azhpc.script_remote_dest: 'hpcadmin@headnode:.'
|
||||
|
||||
# Add the variables needed in your configuration file
|
||||
# Uncomment and set values below, or leave them commented and set them through pipeline variables
|
||||
# azhpc.variables.location: westeurope
|
||||
azhpc.variables.compute_instances: 4
|
||||
azhpc.variables.low_priority: true
|
||||
azhpc.variables.lustre_instances: 4
|
||||
# azhpc.variables.la_name: analytics_workspace_name
|
||||
# azhpc.variables.la_resourcegroup: analytics_resource_group
|
||||
azhpc.variables.log_analytics_lfs_name: lfs
|
||||
azhpc.variables.lustre_mount: /lustre
|
||||
# azhpc.variables.storage_account: account
|
||||
# azhpc.variables.storage_container: container
|
||||
|
||||
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
||||
|
||||
jobs:
|
||||
- template: ../../ci/templates/jobs/build.yml
|
||||
parameters:
|
||||
extra_steps: ../../../examples/lustre_avset/test.yml
|
|
@ -0,0 +1,29 @@
|
|||
# Lustre Cluster
![Build Status](https://azurecat.visualstudio.com/hpccat/_apis/build/status/azhpc/examples/lustre_avset?branchName=master)

Visualisation: [config.json](https://azurehpc.azureedge.net/?o=https://raw.githubusercontent.com/Azure/azurehpc/master/examples/lustre_avset/config.json)

This is a Lustre setup where an Availability Set is used.

Resources:

* Head node (headnode)
* Compute nodes (compute)
* Lustre scaleset
  * Management/Meta-data server on first node using resource disk
  * Object storage servers using all the NVME in a RAID 0
  * Hierarchical storage management daemon on all OSS nodes

The configuration file requires the following variables to be set:

| Variable                | Description                                   |
|-------------------------|-----------------------------------------------|
| resource_group          | The resource group for the project            |
| storage_account         | The storage account for HSM                   |
| storage_key             | The storage key for HSM                       |
| storage_container       | The container to use for HSM                  |
| log_analytics_lfs_name  | The name to use in log analytics              |
| log_analytics_workspace | The log analytics workspace id                |
| log_analytics_key       | The log analytics key                         |

> Note: Macros exist to get the `storage_key` using `sakey.<storage-account-name>`, `log_analytics_workspace` using `laworkspace.<resource-group>.<workspace-name>` and `log_analytics_key` using `lakey.<resource-group>.<workspace-name>`.
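
As an illustration, this is roughly how those macros can look inside the `variables` section of a config file (the account, resource group and workspace names here are placeholders):

```json
{
  "variables": {
    "storage_account": "mystorageaccount",
    "storage_key": "sakey.{{variables.storage_account}}",
    "la_resourcegroup": "my-analytics-rg",
    "la_name": "my-workspace",
    "log_analytics_workspace": "laworkspace.{{variables.la_resourcegroup}}.{{variables.la_name}}",
    "log_analytics_key": "lakey.{{variables.la_resourcegroup}}.{{variables.la_name}}"
  }
}
```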
|
|
@ -0,0 +1,4 @@
|
|||
#!/bin/bash
|
||||
|
||||
yum install -y mpich
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
steps:
|
||||
- template: ../../ci/templates/steps/azhpc-run.yml
|
||||
parameters:
|
||||
user: hpcuser
|
||||
command: /apps/ci/check_pbs_nodes.sh $(azhpc.variables.compute_instances)
|
||||
|
||||
- template: ../../ci/templates/steps/azhpc-run.yml
|
||||
parameters:
|
||||
user: hpcadmin
|
||||
command: /apps/ci/check_mountpoints.sh $(azhpc.variables.lustre_mount)
|
||||
|
||||
- template: ../../ci/templates/steps/azhpc-run.yml
|
||||
parameters:
|
||||
user: hpcadmin
|
||||
command: /apps/ci/check_lustre_client.sh $(azhpc.variables.lustre_mount)
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"vnet": {
|
||||
"name": "myvnet",
|
||||
"address_prefix": "10.2.0.0/24",
|
||||
"subnets": {
|
||||
"compute": "10.2.0.0/24"
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>"
|
||||
},
|
||||
"storage": {
|
||||
"uniquestoragename": {
|
||||
"type": "storageaccount",
|
||||
"sku": "Standard_LRS",
|
||||
"containers": [
|
||||
"mycontainer1",
|
||||
"mycontainer2"
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "variables.jumpbox",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"jumpbox": "fqdn.jumpbox",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"ad_image": "MicrosoftWindowsServer:WindowsServer:2019-Datacenter:latest",
|
||||
"ad_domain": "hpc.local",
|
||||
"ad_vm_type": "Standard_D4s_v3"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"adnode": {
|
||||
"type": "vm",
|
||||
"public_ip": false,
|
||||
"password": "secret.{{variables.key_vault}}.DomainAdminPassword",
|
||||
"vm_type": "variables.ad_vm_type",
|
||||
"os_storage_sku": "StandardSSD_LRS",
|
||||
"os_disk_size": 127,
|
||||
"image": "variables.ad_image",
|
||||
"subnet": "admin",
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "setup_win_ad.sh",
|
||||
"type": "local_script",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"adnode",
|
||||
"variables.ad_domain",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.DomainAdminPassword"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"resource_group": "<NOT-SET>",
|
||||
"location": "<NOT-SET>",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"anf_pool_size_TB": 4,
|
||||
"anf_vol_size_TB": 4,
|
||||
"anf_mount_point": "/netapp"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"storage": {
|
||||
"hpcnetapp": {
|
||||
"type": "anf",
|
||||
"subnet": "netapp",
|
||||
"pools": {
|
||||
"anfpool": {
|
||||
"size": "variables.anf_pool_size_TB",
|
||||
"service_level": "Premium",
|
||||
"volumes": {
|
||||
"anfvol": {
|
||||
"size": "variables.anf_vol_size_TB",
|
||||
"mount": "variables.anf_mount_point"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "cycleserver",
|
||||
"admin_user": "variables.admin_user",
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"vnet_name": "hpcvnet",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"spn_name": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"appid": "",
|
||||
"tenantid": "<NOT-SET>",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"cc_vm_type": "Standard_D8s_v3"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"cycleserver": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.cc_vm_type",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "admin",
|
||||
"tags": [
|
||||
"disable-selinux",
|
||||
"cycle"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cc_install.sh",
|
||||
"tag": "cycle",
|
||||
"sudo": true,
|
||||
"args": [
|
||||
"cycleserver",
|
||||
"secret.{{variables.key_vault}}.{{variables.spn_name}}",
|
||||
"variables.appid",
|
||||
"variables.tenantid",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.CycleAdminPassword",
|
||||
"variables.projectstore"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "cc_nsg.sh",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"cycleserver"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "variables.admin_user",
|
||||
"vnet": {
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"variables": {
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"spn_name": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"vnet_name": "hpcvnet",
|
||||
"appid": "<NOT-SET>"
|
||||
},
|
||||
"resources": {},
|
||||
"install": [
|
||||
{
|
||||
"type": "local_script",
|
||||
"script": "cc_prereqs.sh",
|
||||
"args": [
|
||||
"variables.resource_group",
|
||||
"variables.key_vault",
|
||||
"variables.spn_name",
|
||||
"variables.projectstore",
|
||||
"variables.appid"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "variables.jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"vnet": {
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"compute": "10.2.0.0/22",
|
||||
"storage": "10.2.4.0/24"
|
||||
}
|
||||
},
|
||||
"variables": {
|
||||
"resource_group": "<NOT-SET>",
|
||||
"location": "<NOT-SET>",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"jumpbox": "fqdn.jumpbox",
|
||||
"glusterfsimage": "OpenLogic:CentOS:7.7:latest",
|
||||
"hpcimage": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"compute_vm_type": "Standard_HB60rs",
|
||||
"glusterfs_instances": 4,
|
||||
"low_priority": false,
|
||||
"glusterfs_mount": "/glusterfs",
|
||||
"glusterfs_stripe": 1,
|
||||
"glusterfs_replica": 0,
|
||||
"key_vault": "hpccatkv",
|
||||
"projectstore": "lockerfff41f"
|
||||
},
|
||||
"resources": {
|
||||
"glusterfsmaster": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_L16s_v2",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.glusterfsimage",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"disable-selinux",
|
||||
"create_raid0",
|
||||
"make_filesystem",
|
||||
"glusterfs_pkgs",
|
||||
"glusterfs_pool_master",
|
||||
"glusterfs_vol"
|
||||
]
|
||||
},
|
||||
"glusterfs": {
|
||||
"type": "vmss",
|
||||
"vm_type": "Standard_L16s_v2",
|
||||
"instances": "variables.glusterfs_instances",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.glusterfsimage",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"disable-selinux",
|
||||
"create_raid0",
|
||||
"make_filesystem",
|
||||
"glusterfs_pkgs",
|
||||
"glusterfs_pool",
|
||||
"glusterfs_vol"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "install-nfsserver.sh",
|
||||
"tag": "nfsserver",
|
||||
"args": [
|
||||
"/share"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "nfsclient.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/nfsserver)"
|
||||
],
|
||||
"tag": "nfsclient",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "localuser.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/nfsserver)"
|
||||
],
|
||||
"tag": "localuser",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "create_raid0.sh",
|
||||
"args": [
|
||||
"/dev/md20",
|
||||
"/dev/nvme*n1"
|
||||
],
|
||||
"tag": "create_raid0",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "make_filesystem.sh",
|
||||
"args": [
|
||||
"/dev/md20",
|
||||
"xfs",
|
||||
"/mnt/brick1"
|
||||
],
|
||||
"tag": "make_filesystem",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh",
|
||||
"tag": "glusterfs_pkgs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_pool.sh",
|
||||
"tag": "glusterfs_pool",
|
||||
"args": [
|
||||
"$(<hostlists/glusterfs)"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_pool_master.sh",
|
||||
"tag": "glusterfs_pool_master",
|
||||
"args": [
|
||||
"$(<hostlists/glusterfs)"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_vol.sh",
|
||||
"tag": "glusterfs_vol",
|
||||
"args": [
|
||||
"$(<hostlists/glusterfs)",
|
||||
"variables.glusterfs_stripe",
|
||||
"variables.glusterfs_replica"
|
||||
],
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,96 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "variables.jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"jumpbox": "fqdn.jumpbox",
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"glusterfs_storage_instances": 2,
|
||||
"glusterfs_storage_vm_type": "Standard_L16s_v2",
|
||||
"glusterfs_mount": "/glusterfs",
|
||||
"glusterfs_stripe": 1,
|
||||
"glusterfs_replica": 0
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"resources": {
|
||||
"glusterfs": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.glusterfs_storage_vm_type",
|
||||
"instances": "variables.glusterfs_storage_instances",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"disable-selinux",
|
||||
"create_raid0",
|
||||
"make_filesystem",
|
||||
"glusterfs_pkgs",
|
||||
"glusterfs_pool",
|
||||
"glusterfs_vol"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "create_raid0.sh",
|
||||
"args": [
|
||||
"/dev/md20",
|
||||
"/dev/nvme*n1"
|
||||
],
|
||||
"tag": "create_raid0",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "make_filesystem.sh",
|
||||
"args": [
|
||||
"/dev/md20",
|
||||
"xfs",
|
||||
"/mnt/brick1"
|
||||
],
|
||||
"tag": "make_filesystem",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh",
|
||||
"tag": "glusterfs_pkgs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_pool.sh",
|
||||
"tag": "glusterfs_pool",
|
||||
"args": [
|
||||
"$(<hostlists/glusterfs)"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_vol.sh",
|
||||
"tag": "glusterfs_vol",
|
||||
"args": [
|
||||
"$(<hostlists/glusterfs)",
|
||||
"variables.glusterfs_stripe",
|
||||
"variables.glusterfs_replica"
|
||||
],
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -14,38 +14,53 @@
|
|||
"jumpbox": "fqdn.jumpbox",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"lustre_image": "OpenLogic:CentOS:7.7:latest",
|
||||
"lustre_vm_type": "Standard_L8s_v2",
|
||||
"lustre_instances": 4,
|
||||
"lustre_vm_type": "Standard_L16s_v2",
|
||||
"lustre_instances": 2,
|
||||
"lustre_storage_account": "<NOT-SET>",
|
||||
"lustre_storage_container": "<NOT-SET>",
|
||||
"lustre_version": "2.12.4",
|
||||
"lustre_mount": "/lustre"
|
||||
"lustre_storage_container": "hsm",
|
||||
"lustre_version": "<NOT-SET>",
|
||||
"lustre_mount": "/lustre",
|
||||
"lustre_avset": "{{variables.resource_group}}avset"
|
||||
},
|
||||
"resources": {
|
||||
"lustremaster": {
|
||||
"type": "vm",
|
||||
"vm_type": "variables.lustre_vm_type",
|
||||
"availability_set": "variables.lustre_avset",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.lustre_image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"lustremaster",
|
||||
"lustre",
|
||||
"lfsrepo",
|
||||
"disable-selinux"
|
||||
]
|
||||
},
|
||||
"lustre": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.lustre_vm_type",
|
||||
"instances": "variables.lustre_instances",
|
||||
"availability_set": "variables.lustre_avset",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.lustreimage",
|
||||
"image": "variables.lustre_image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"lustre",
|
||||
"lfsrepo",
|
||||
"disable-selinux"
|
||||
"lfsrepo"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
|
@ -72,7 +87,7 @@
|
|||
},
|
||||
{
|
||||
"script": "lfsmaster.sh",
|
||||
"tag": "lustre",
|
||||
"tag": "lustremaster",
|
||||
"args": [
|
||||
"/dev/sdb"
|
||||
],
|
||||
|
@ -81,7 +96,7 @@
|
|||
{
|
||||
"script": "lfsoss.sh",
|
||||
"args": [
|
||||
"$(head -n1 hostlists/tags/lustre)",
|
||||
"lustremaster",
|
||||
"/dev/md10"
|
||||
],
|
||||
"tag": "lustre",
|
||||
|
@ -90,7 +105,7 @@
|
|||
{
|
||||
"script": "lfshsm.sh",
|
||||
"args": [
|
||||
"$(head -n1 hostlists/tags/lustre)",
|
||||
"lustremaster",
|
||||
"variables.lustre_storage_account",
|
||||
"sakey.{{variables.lustre_storage_account}}",
|
||||
"variables.lustre_storage_container",
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"anf_mount_host": "<NOT-SET>",
|
||||
"anf_mount_point": "/netapps"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbs_anf_cycle": {
|
||||
"template": "pbspro_template_1.3.5",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:anf:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:anf:1.0.0",
|
||||
"Spec": "anf",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:anf:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:anf:1.0.0",
|
||||
"Spec": "anf",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:anf:1.0.0": [
|
||||
{
|
||||
"script": "auto_mount.sh",
|
||||
"args": [
|
||||
"variables.anf_mount_host",
|
||||
"variables.anf_mount_point"
|
||||
]
|
||||
},
|
||||
{
|
||||
"script": "replace_nfs_with_anf.sh",
|
||||
"args": [ "variables.anf_mount_point" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
},
|
||||
"install": [
|
||||
]
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
# Azure NetApp Files and CycleCloud Integration

Outlines the procedure to access an Azure NetApp Files (ANF) volume deployed by AzureHPC from CycleCloud (PBS or SLURM).

## Pre-requisites:

* An installed and configured Azure CycleCloud Application Server (instructions [here](https://docs.microsoft.com/en-us/azure/cyclecloud/quickstart-install-cyclecloud) or using the [azurehpc script](https://github.com/Azure/azurehpc/tree/master/examples/cycleserver))
* The Azure CycleCloud CLI (instructions [here](https://docs.microsoft.com/en-us/azure/cyclecloud/install-cyclecloud-cli))
* Azure NetApp Files (ANF) deployed with AzureHPC ([examples/anf_full](https://github.com/Azure/azurehpc/tree/hackathon_june_2020/examples/anf_full)).

## Overview of procedure

The "azhpc ccbuild" command will use a config file to generate AzureHPC projects/Specs and upload them to your default CycleCloud locker. A CycleCloud template parameter file will also be generated based on the parameters you specify in the config file. A default CycleCloud template (PBS or SLURM), i.e. with no editing of the CC template, will then be used to start a CycleCloud cluster using the generated template parameter json file.

## Update the `anf_cycle.json` file (pick pbs or slurm as your preferred scheduler)

Azurehpc provides the `azhpc-init` command that can help here by copying the directory and substituting the unset variables. First run with the `-s` parameter to see which variables need to be set:

```
$ azhpc init -c $azhpc_dir/examples/cc_anf/pbs_anf_cycle.json -d cc_anf -s
```

The variables can be set with the `-v` option, where variables are comma separated; use the output from the previous command as a starting point. The `-d` option is required and will create a new directory for you. Update `resource_group` to whichever resource group you would like to deploy to:

```
$ azhpc-init -c $azhpc_dir/examples/cc_anf/pbs_anf_cycle.json -d cc_anf -v resource_group=azurehpc-cc
```
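
If several variables need to be set, they can be passed as a single comma-separated `-v` list, for example (the values here are placeholders for your own environment):

```
$ azhpc-init -c $azhpc_dir/examples/cc_anf/pbs_anf_cycle.json -d cc_anf -v resource_group=azurehpc-cc,location=westeurope,vnet_name=hpcvnet
```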

NOTE: Make sure the value of `template` is set correctly, e.g.:
```
"template": "pbspro_template_1.3.5",
```
You can run the command below to list the existing cluster templates on your CycleCloud server:
```
$ cyclecloud show_cluster -t
```

## Create CycleCloud Cluster with AzureHPC ANF

```
$ cd cc_anf
$ azhpc ccbuild -c pbs_anf_cycle.json
```

## Start CycleCloud Cluster

Go to the CycleCloud server portal, find your CycleCloud cluster, and click start.
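
If you prefer the command line, the cluster can also be started from the deployment machine with the CycleCloud CLI (the cluster name below is the one defined in the pbs config; use the slurm cluster name if you chose that scheduler):

```
$ cyclecloud start_cluster pbs_anf_cycle
```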

## Connect to the master node of your cluster, and then check that ANF is mounted

```
[hpcadmin@jumpbox examples]$ cyclecloud connect master -c anfcycle
Connecting to hpcadmin@10.2.4.9 (anfcycle master) using SSH
Last login: Thu Jun 11 09:16:37 2020 from 10.22.1.4

__ __ | ___ __ | __ __|
(___ (__| (___ |_, (__/_ (___ |_, (__) (__(_ (__|

Cluster: anfcycle
Version: 7.9.5
Run List: recipe[cyclecloud], role[pbspro_master_role], recipe[cluster_init]
[hpcadmin@ip-0A020409 ~]$ df -h
Filesystem               Size  Used Avail Use% Mounted on
devtmpfs                  16G     0   16G   0% /dev
tmpfs                     16G     0   16G   0% /dev/shm
tmpfs                     16G  9.1M   16G   1% /run
tmpfs                     16G     0   16G   0% /sys/fs/cgroup
/dev/sda2                 30G  9.8G   20G  34% /
/dev/sda1                494M   65M  430M  13% /boot
/dev/sda15               495M   12M  484M   3% /boot/efi
/dev/sdb1                 63G   53M   60G   1% /mnt/resource
10.2.8.4:/raymondanfvol  100T  448K  100T   1% /netapps
tmpfs                    3.2G     0  3.2G   0% /run/user/20003
```
|
|
@ -0,0 +1,79 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "<NOT-SET>",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"anf_mount_host": "<NOT-SET>",
|
||||
"anf_mount_point": "/netapps"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"slurm_anf_cycle": {
|
||||
"template": "slurm_template_2.1.0",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:anf:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:anf:1.0.0",
|
||||
"Spec": "anf",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:anf:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:anf:1.0.0",
|
||||
"Spec": "anf",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:anf:1.0.0": [
|
||||
{
|
||||
"script": "auto_mount.sh",
|
||||
"args": [
|
||||
"variables.anf_mount_host",
|
||||
"variables.anf_mount_point"
|
||||
]
|
||||
},
|
||||
{
|
||||
"script": "replace_nfs_with_anf.sh",
|
||||
"args": [ "variables.anf_mount_point" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
},
|
||||
"install": [
|
||||
]
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
# Generate a short random suffix and derive a unique locker (project store) name
uuid_str="$(cat /proc/sys/kernel/random/uuid | tr -d '\n-' | tr '[:upper:]' '[:lower:]' | cut -c 1-6)"
locker="locker$uuid_str"

# Record the generated project store name back into variables.json
jq '.projectstore=$locker' --arg locker $locker variables.json > temp.json
cp temp.json variables.json

# Copy a building-block config into the working directory and merge in the local variables
function init_config()
{
    local config=$1
    azhpc-init -d . -c $config
    config_file=${config##*/}
    cp $config_file temp.json
    jq '.variables+=$variables' --argjson variables "$(cat variables.json)" temp.json > $config_file
}

blocks="vnet.json jumpbox.json ad.json cycle-prereqs-managed-identity.json cycle-install-server-managed-identity.json cycle-cli-local.json cycle-cli-jumpbox.json beegfs-cluster.json"
for block in $blocks; do
    echo "initializing config for $block"
    init_config $azhpc_dir/experimental/blocks/$block
done

init_config $azhpc_dir/experimental/cc_beegfs_ad/pbscycle.json
|
|
@ -0,0 +1,104 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"admin_user": "hpcadmin",
|
||||
"ad_domain": "hpc.local"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:adjoin:1.0.0": {
|
||||
"Order": 10100,
|
||||
"Name": "azurehpc:adjoin:1.0.0",
|
||||
"Spec": "adjoin",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:adjoin:1.0.0": {
|
||||
"Order": 10100,
|
||||
"Name": "azurehpc:adjoin:1.0.0",
|
||||
"Spec": "adjoin",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:beegfs:1.0.0": [
|
||||
{
|
||||
"script": "beegfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [
|
||||
"beegfsm"
|
||||
]
|
||||
}
|
||||
],
|
||||
"azurehpc:adjoin:1.0.0": [
|
||||
{
|
||||
"script": "join_domain_simple.sh",
|
||||
"args": [
|
||||
"variables.ad_domain",
|
||||
"variables.admin_user",
|
||||
"secret.{{variables.key_vault}}.DomainAdminPassword",
|
||||
"adnode"
|
||||
],
|
||||
"deps": [ "mkhomedir.pp" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
# Building the infrastructure
Here we will explain how to deploy a full system with a VNET, JUMPBOX, Active Directory, CYCLESERVER and BEEGFS by using building blocks. These blocks are stored in the experimental/blocks directory.

## Step 1 - Install azhpc
After cloning azhpc, source the install.sh script.

## Step 2 - Initialize the configuration files
Create a working directory from where you will do the deployment and configuration update. Don't work directly from the cloned repo.

```
$ mkdir cluster
$ cd cluster
```

Then copy the init.sh and variables.json from experimental/cc_beegfs_ad to your working directory.

```
$ cp $azhpc_dir/experimental/cc_beegfs_ad/init.sh .
$ cp $azhpc_dir/experimental/cc_beegfs_ad/variables.json .
```

Edit the variables.json to match your environment. Leave projectstore empty, as it will be filled in with a random value by the init script. An existing keyvault should be referenced, as it won't be created for you.

```json
{
  "resource_group": "my resource group",
  "location": "westeurope",
  "key_vault": "my key vault",
  "projectstore": ""
}
```

Run the init.sh script, which will copy all the config files of the building blocks and initialize the variables using the variables.json updated above.

```
$ ./init.sh
```

## Step 3 - Build the system

```
$ azhpc-build -c vnet.json
$ azhpc-build --no-vnet -c jumpbox.json
$ azhpc-build --no-vnet -c ad.json
$ azhpc-build --no-vnet -c cycle-prereqs-managed-identity.json
$ azhpc-build --no-vnet -c cycle-install-server-managed-identity.json
```

## Step 4 - Connect to CycleServer UI
Retrieve the CycleServer DNS name from the Azure portal and browse to it with https.
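
Alternatively, the FQDN is printed when connecting to the VM with azhpc-connect (the hostname in the log line below is just an example from another deployment):

```
$ azhpc-connect -c cycle-install-server-managed-identity.json cycleserver
[2020-06-10 08:28:04] logging directly into cycleserver559036.westeurope.cloudapp.azure.com
```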

Retrieve the Cycle admin password from the logs:

```
$ grep password azhpc_install_cycle-install-server-managed-identity/install/*.log
```

Connect to the Cycle UI with the hpcadmin user and the password retrieved above.

## Step 5 - Deploy the Cycle CLI
Deploy the Cycle CLI locally and on the jumpbox.

```
$ azhpc-build --no-vnet -c cycle-cli-local.json
$ azhpc-build --no-vnet -c cycle-cli-jumpbox.json
```

## Step 6 - Deploy the BeeGFS cluster
```
$ azhpc-build --no-vnet -c beegfs-cluster.json
```

## Step 7 - Create the PBS cluster in CycleCloud

```
$ azhpc ccbuild -c pbscycle.json
```

## Step 8 - Connect to the Active Directory to create users

```
$ azhpc-connect -c ad.json adnode
```
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"resource_group": "my resource group",
|
||||
"location": "westeurope",
|
||||
"key_vault": "my key vault",
|
||||
"projectstore": ""
|
||||
}
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"glusterfs_mount_host": "<NOT-SET>",
|
||||
"glusterfs_mount_point": "/glusterfs",
|
||||
"vnet_resource_group": "variables.resource_group"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"storage": "10.2.3.0/24",
|
||||
"compute": "10.2.4.0/22"
|
||||
}
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "southcentralus",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:glusterfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:glusterfs:1.0.0",
|
||||
"Spec": "glusterfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:glusterfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:glusterfs:1.0.0",
|
||||
"Spec": "glusterfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:default:1.0.0": [
|
||||
{
|
||||
"script": "cndefault.sh"
|
||||
}
|
||||
],
|
||||
"azurehpc:glusterfs:1.0.0": [
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_client.sh",
|
||||
"args": [
|
||||
"variables.glusterfs_mount_host",
|
||||
"variables.glusterfs_mount_point"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,127 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"htc_image": "OpenLogic:CentOS:7.7:latest",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"hpc_vm_type": "Standard_HB60rs",
|
||||
"htc_vm_type": "Standard_D32s_v3",
|
||||
"glusterfs_mount_host": "<NOT-SET>",
|
||||
"glusterfs_mount_point": "/glusterfs",
|
||||
"vnet_resource_group": "variables.resource_group"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"storage": "10.2.3.0/24",
|
||||
"compute": "10.2.4.0/22"
|
||||
}
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"slurmcycle": {
|
||||
"template": "slurm_template_2.1.0",
|
||||
"parameters": {
|
||||
"MaxHPCExecuteCoreCount": 240,
|
||||
"MaxHTCExecuteCoreCount": 128,
|
||||
"HTCUseLowPrio" : false,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"HTCMachineType": "variables.htc_vm_type",
|
||||
"HPCMachineType": "variables.hpc_vm_type",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "southcentralus",
|
||||
"MasterImageName": "variables.htc_image",
|
||||
"HTCImageName": "variables.htc_image",
|
||||
"HPCImageName": "variables.hpc_image",
|
||||
"configuration_slurm_version" : "19.05.5-1",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:glusterfs:1.0.0": {
|
||||
"Order": 10100,
|
||||
"Name": "azurehpc:glusterfs:1.0.0",
|
||||
"Spec": "glusterfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HPCClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:glusterfs:1.0.0": {
|
||||
"Order": 10100,
|
||||
"Name": "azurehpc:glusterfs:1.0.0",
|
||||
"Spec": "glusterfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HTCClusterInitSpecs": {
|
||||
"azurehpc:default:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:default:1.0.0",
|
||||
"Spec": "default",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
},
|
||||
"azurehpc:glusterfs:1.0.0": {
|
||||
"Order": 10100,
|
||||
"Name": "azurehpc:glusterfs:1.0.0",
|
||||
"Spec": "glusterfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:default:1.0.0": [
|
||||
{
|
||||
"script": "cndefault.sh"
|
||||
}
|
||||
],
|
||||
"azurehpc:glusterfs:1.0.0": [
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_client.sh",
|
||||
"args": [
|
||||
"variables.glusterfs_mount_host",
|
||||
"variables.glusterfs_mount_point"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
# AzureHPC GlusterFS and CycleCloud Integration

Outlines the procedure to access a GlusterFS PFS deployed by AzureHPC from CycleCloud (PBS or SLURM).

## Pre-requisites:

* An installed and configured Azure CycleCloud Application Server (instructions [here](https://docs.microsoft.com/en-us/azure/cyclecloud/quickstart-install-cyclecloud) or using the [azurehpc script](https://github.com/Azure/azurehpc/tree/master/examples/cycleserver))
* The Azure CycleCloud CLI (instructions [here](https://docs.microsoft.com/en-us/azure/cyclecloud/install-cyclecloud-cli))
* A GlusterFS PFS deployed with AzureHPC ([examples/glusterfs_ephemeral](https://github.com/Azure/azurehpc/tree/master/examples/glusterfs_ephemeral)).

## Overview of procedure

The "azhpc ccbuild" command will use a config file to generate AzureHPC projects/Specs and upload them to your default CycleCloud locker. A CycleCloud template parameter file will also be generated based on the parameters you specify in the config file. A default CycleCloud template (PBS or SLURM), i.e. with no editing of the CC template, will then be used to start a CycleCloud cluster using the generated template parameter json file.

## Initialize the AzureHPC project (e.g. for PBS; a similar procedure applies for SLURM using the other config file)

To start, you need to copy this directory and update the `config.json`. Azurehpc provides the `azhpc-init` command that can help here by copying the directory and substituting the unset variables. First run with the `-s` parameter to see which variables need to be set:

```
azhpc init -c $azhpc_dir/examples/cc_glusterfs/config_pbscycle.json -d cc_pbs_glusterfs -s
```

The variables can be set with the `-v` option, where variables are comma separated; use the output from the previous command as a starting point. The `-d` option is required and will create a new directory for you. Update `resource_group` to whichever resource group you would like to deploy to:

```
azhpc-init -c $azhpc_dir/examples/cc_glusterfs/config_pbscycle.json -d cc_pbs_glusterfs -v resource_group=azurehpc-cc
```

The `glusterfs_mount_host` variable can be obtained by running `head -n1 azhpc_install_config/hostlists/glusterfs` in the directory from which the GlusterFS file system was built.
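
For example, it could be captured and passed to `azhpc-init` together with the resource group in one comma-separated `-v` list (run from the GlusterFS build directory; values shown are placeholders):

```
azhpc-init -c $azhpc_dir/examples/cc_glusterfs/config_pbscycle.json -d cc_pbs_glusterfs -v resource_group=azurehpc-cc,glusterfs_mount_host=$(head -n1 azhpc_install_config/hostlists/glusterfs)
```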

## Create CycleCloud Cluster with AzureHPC GlusterFS

```
cd cc_pbs_glusterfs
azhpc-build -c config_pbscycle.json --no-vnet
```
> Note: there is also a config file for CC SLURM integration (config_slurmcycle.json).

## Start CycleCloud Cluster
Go to the CycleCloud server, find your CycleCloud cluster (pbscycle or slurmcycle) and click start.

## Check that GlusterFS is mounted on the master and nodearray resources

```
df -h
```
|
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
|
||||
azhpc-build -c vnet.json
|
||||
|
||||
blocks="jumpbox.json cycle-prereqs-managed-identity.json cycle-install-server-managed-identity.json cycle-cli-local.json cycle-cli-jumpbox.json gluster-cluster.json"
|
||||
for block in $blocks; do
|
||||
azhpc-build --no-vnet -c $block
|
||||
done
|
||||
|
||||
azhpc ccbuild -c pbscycle.json
|
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash
|
||||
uuid_str="$(cat /proc/sys/kernel/random/uuid | tr -d '\n-' | tr '[:upper:]' '[:lower:]' | cut -c 1-6)"
|
||||
locker="locker$uuid_str"
|
||||
|
||||
jq '.projectstore=$locker' --arg locker $locker variables.json > temp.json
|
||||
cp temp.json variables.json
|
||||
|
||||
function init_config()
|
||||
{
|
||||
local config=$1
|
||||
azhpc-init -d . -c $config
|
||||
config_file=${config##*/}
|
||||
cp $config_file temp.json
|
||||
jq '.variables+=$variables' --argjson variables "$(cat variables.json)" temp.json > $config_file
|
||||
}
|
||||
|
||||
blocks="vnet.json jumpbox.json cycle-prereqs-managed-identity.json cycle-install-server-managed-identity.json cycle-cli-local.json cycle-cli-jumpbox.json gluster-cluster.json"
|
||||
for block in $blocks; do
|
||||
echo "initializing config for $block"
|
||||
init_config $azhpc_dir/experimental/blocks/$block
|
||||
done
|
||||
|
||||
sed -i "s/#projectstore#/$locker/g" cycle-install-server-managed-identity.json
|
||||
init_config $azhpc_dir/experimental/cc_glusterfs2/pbscycle.json
|
||||
init_config $azhpc_dir/experimental/cc_glusterfs2/slurmcycle.json
|
|
@ -0,0 +1,76 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "southcentralus",
|
||||
"resource_group": "narcycle",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"key_vault": "hpccatkv",
|
||||
"gluster_version": "2.12.4",
|
||||
"projectstore": "locker66fb14"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:gluster:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:gluster:1.0.0",
|
||||
"Spec": "gluster",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:gluster:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:gluster:1.0.0",
|
||||
"Spec": "gluster",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:gluster:1.0.0": [
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_client.sh",
|
||||
"args": [
|
||||
"glusterfsmaster",
|
||||
"/glusterfs"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
# Building the infrastructure
|
||||
Here we explain how to deploy a full GlusterFS system with a VNET, JUMPBOX, CYCLESERVER and Gluster cluster by using building blocks. These blocks are stored in the experimental/blocks directory.
|
||||
|
||||
## Step 1 - install azhpc
|
||||
After cloning azhpc, source the install.sh script.
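A minimal sketch of this step (assuming the repository is cloned into your home directory; adjust the path as needed):

```
$ git clone https://github.com/Azure/azurehpc.git
$ cd azurehpc
$ source install.sh
```

Sourcing install.sh is expected to put the `azhpc-*` commands on your `PATH` and set the `$azhpc_dir` variable used in the copy commands below.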
|
||||
|
||||
## Step 2 - Initialize the configuration files
|
||||
Create a working directory from which you will do the deployment and configuration updates. Don't work directly from the cloned repo.
|
||||
|
||||
```
|
||||
$ mkdir cluster
|
||||
$ cd cluster
|
||||
```
|
||||
|
||||
Then copy the init.sh, variables.json, and scripts directory from examples/cc_glusterfs2 to your working directory.
|
||||
|
||||
```
|
||||
$ cp $azhpc_dir/examples/cc_glusterfs2/init.sh .
|
||||
$ cp $azhpc_dir/examples/cc_glusterfs2/variables.json .
|
||||
$ cp -r $azhpc_dir/examples/cc_glusterfs2/scripts .
|
||||
```
|
||||
|
||||
Edit variables.json to match your environment. Leave projectstore empty, as it will be filled in with a random value by the init script. An existing key vault should be referenced, as it won't be created for you.
|
||||
|
||||
```json
|
||||
{
|
||||
"resource_group": "my resource group",
|
||||
"location": "location",
|
||||
"key_vault": "my key vault",
|
||||
"projectstore": ""
|
||||
}
|
||||
```
|
||||
|
||||
Run the init.sh script, which copies all the building block config files into the working directory and initializes their variables from the variables.json updated above.
|
||||
|
||||
```
|
||||
$ ./init.sh
|
||||
```
|
||||
|
||||
## Step 3 - Build the system
|
||||
|
||||
```
|
||||
$ azhpc-build -c vnet.json
|
||||
$ azhpc-build --no-vnet -c jumpbox.json
|
||||
$ azhpc-build --no-vnet -c cycle-prereqs-managed-identity.json
|
||||
$ azhpc-build --no-vnet -c cycle-install-server-managed-identity.json
|
||||
```
|
||||
|
||||
## Step 4 - Deploy the Cycle CLI
|
||||
Deploy the Cycle CLI locally and on the jumpbox
|
||||
|
||||
```
|
||||
$ azhpc-build --no-vnet -c cycle-cli-local.json
|
||||
$ azhpc-build --no-vnet -c cycle-cli-jumpbox.json
|
||||
```
|
||||
|
||||
## Step 5 - Deploy the GlusterFS cluster
|
||||
```
|
||||
$ azhpc-build --no-vnet -c gluster-cluster.json
|
||||
```
|
||||
|
||||
## Step 6 - Create the PBS cluster in CycleCloud
|
||||
|
||||
```
|
||||
$ azhpc build -c pbscycle.json --no-vnet
|
||||
```
|
||||
|
||||
## Step 7 - Connect to the CycleServer UI
|
||||
Retrieve the CycleServer DNS name by connecting with azhpc
|
||||
|
||||
```
|
||||
$ azhpc-connect -c cycle-install-server-managed-identity.json cycleserver
|
||||
[2020-06-10 08:28:04] logging directly into cycleserver559036.westeurope.cloudapp.azure.com
|
||||
[hpcadmin@cycleserver ~]$ exit
|
||||
```
|
||||
|
||||
Retrieve the Cycle admin password from the logs
|
||||
|
||||
```
|
||||
$ grep password azhpc_install_cycle-install-server-managed-identity/install/*.log
|
||||
```
|
||||
|
||||
Connect to the Cycle UI with the hpcadmin user and the password retrieved above. Check that a pbscycle cluster is ready and start it.
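If you prefer the command line, the CycleCloud CLI installed locally in the Cycle CLI step can be used instead of the UI (a sketch, using the cluster name created above):

```
$ cyclecloud show_cluster pbscycle
$ cyclecloud start_cluster pbscycle
```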
|
||||
# AzureHPC BeeGFS and CycleCloud Integration
|
|
@ -0,0 +1,4 @@
|
|||
#!/bin/bash
|
||||
|
||||
systemctl start glusterd
|
||||
systemctl status glusterd
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
HOSTLIST=$1
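# $1 is a space-separated list of gluster peer hostnames to probe from this node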
|
||||
|
||||
systemctl start glusterd
|
||||
systemctl status glusterd
|
||||
|
||||
for host in $HOSTLIST
|
||||
do
|
||||
gluster peer probe $host
|
||||
done
|
|
@ -0,0 +1,91 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"hpc_vm_type": "Standard_HB60rs",
|
||||
"htc_vm_type": "Standard_D16s_v3",
|
||||
"location": "southcentralus",
|
||||
"resource_group": "narcycle1",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"key_vault": "hpccatkv",
|
||||
"projectstore": "locker1a5c9c"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"slurmcycle": {
|
||||
"template": "slurm_template_2.1.0",
|
||||
"parameters": {
|
||||
"MaxHPCExecuteCoreCount": 240,
|
||||
"MaxHTCExecuteCoreCount": 128,
|
||||
"HTCUseLowPrio": false,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"HTCMachineType": "variables.htc_vm_type",
|
||||
"HPCMachineType": "variables.hpc_vm_type",
|
||||
"MasterImageName": "variables.htc_image",
|
||||
"HTCImageName": "variables.htc_image",
|
||||
"HPCImageName": "variables.hpc_image",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "{{variables.location}}",
|
||||
"configuration_slurm_version": "19.05.5-1",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HPCClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HTCClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:lustre:1.0.0": [
|
||||
{
|
||||
"script": "glusterfs_pkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "glusterfs_client.sh",
|
||||
"args": [
|
||||
"glusterfsmaster",
|
||||
"/glusterfs"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"resource_group": "<NOT-SET>",
|
||||
"location": "<NOT-SET>",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"projectstore": ""
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
azhpc-build -c vnet.json
|
||||
|
||||
blocks="jumpbox.json cycle-prereqs-managed-identity.json cycle-install-server-managed-identity.json cycle-cli-local.json cycle-cli-jumpbox.json lustre-cluster.json"
|
||||
for block in $blocks; do
|
||||
azhpc-build --no-vnet -c $block
|
||||
done
|
||||
|
||||
azhpc ccbuild -c pbscycle.json
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash
|
||||
uuid_str="$(cat /proc/sys/kernel/random/uuid | tr -d '\n-' | tr '[:upper:]' '[:lower:]' | cut -c 1-6)"
|
||||
locker="locker$uuid_str"
|
||||
|
||||
jq '.projectstore=$locker' --arg locker $locker variables.json > temp.json
|
||||
cp temp.json variables.json
|
||||
|
||||
function init_config()
|
||||
{
|
||||
local config=$1
|
||||
azhpc-init -d . -c $config
|
||||
config_file=${config##*/}
|
||||
cp $config_file temp.json
|
||||
jq '.variables+=$variables' --argjson variables "$(cat variables.json)" temp.json > $config_file
|
||||
}
|
||||
|
||||
blocks="vnet.json jumpbox.json cycle-prereqs-managed-identity.json cycle-install-server-managed-identity.json cycle-cli-local.json cycle-cli-jumpbox.json lustre-cluster.json"
|
||||
for block in $blocks; do
|
||||
echo "initializing config for $block"
|
||||
init_config $azhpc_dir/experimental/blocks/$block
|
||||
done
|
||||
|
||||
sed -i "s/#projectstore#/$locker/g" cycle-install-server-managed-identity.json
|
||||
init_config $azhpc_dir/examples/cc_lustre/pbscycle.json
|
||||
init_config $azhpc_dir/examples/cc_lustre/slurmcycle.json
|
|
@ -0,0 +1,105 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "variables.jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"variables": {
|
||||
"resource_group": "<NOT-SET>",
|
||||
"location": "<NOT-SET>",
|
||||
"vnet_name": "hpcvnet",
|
||||
"jumpbox": "fqdn.jumpbox",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"lustre_image": "OpenLogic:CentOS:7.7:latest",
|
||||
"lustre_vm_type": "Standard_L16s_v2",
|
||||
"lustre_instances": 2,
|
||||
"lustre_storage_account": "<NOT-SET>",
|
||||
"lustre_storage_container": "hsm",
|
||||
"lustre_version": "2.12.4",
|
||||
"lustre_mount": "/lustre",
|
||||
"key_vault": "<NOT-SET>",
|
||||
"projectstore": "<NOT-SET>"
|
||||
},
|
||||
"resources": {
|
||||
"lustre": {
|
||||
"type": "vmss",
|
||||
"vm_type": "variables.lustre_vm_type",
|
||||
"instances": "variables.lustre_instances",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.lustre_image",
|
||||
"subnet": "storage",
|
||||
"tags": [
|
||||
"cndefault",
|
||||
"lustre",
|
||||
"lfsrepo",
|
||||
"disable-selinux"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "cndefault.sh",
|
||||
"tag": "cndefault",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "create_raid0.sh",
|
||||
"args": [
|
||||
"/dev/md10",
|
||||
"/dev/nvme*n1"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsrepo.sh",
|
||||
"tag": "lfsrepo",
|
||||
"args": [
|
||||
"variables.lustre_version"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfspkgs.sh",
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsmaster.sh",
|
||||
"tag": "lustre",
|
||||
"args": [
|
||||
"/dev/sdb"
|
||||
],
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfsoss.sh",
|
||||
"args": [
|
||||
"$(head -n1 hostlists/tags/lustre)",
|
||||
"/dev/md10"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "lfshsm.sh",
|
||||
"args": [
|
||||
"$(head -n1 hostlists/tags/lustre)",
|
||||
"variables.lustre_storage_account",
|
||||
"sakey.{{variables.lustre_storage_account}}",
|
||||
"variables.lustre_storage_container",
|
||||
"variables.lustre_version"
|
||||
],
|
||||
"tag": "lustre",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "southcentralus",
|
||||
"resource_group": "narcycle1",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"key_vault": "hpccatkv",
|
||||
"lustre_version": "2.12.4",
|
||||
"projectstore": "locker1a5c9c"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/{{variables.vnet_name}}/compute",
|
||||
"UseLowPrio": false,
|
||||
"UsePublicNetwork": false,
|
||||
"ReturnProxy": false,
|
||||
"Region": "variables.location",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:lustre:1.0.0": [
|
||||
{
|
||||
"script": "lfsrepo.sh",
|
||||
"args": [
|
||||
"variables.lustre_version"
|
||||
]
|
||||
},
|
||||
{
|
||||
"script": "lfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "lfsclient.sh",
|
||||
"args": [
|
||||
"lustremaster"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
# Building the infrastructure
|
||||
Here we explain how to deploy a full Lustre system with a VNET, JUMPBOX, CYCLESERVER and Lustre cluster by using building blocks. These blocks are stored in the experimental/blocks directory.
|
||||
|
||||
## Step 1 - install azhpc
|
||||
After cloning azhpc, source the install.sh script.
|
||||
|
||||
## Step 2 - Initialize the configuration files
|
||||
Create a working directory from which you will do the deployment and configuration updates. Don't work directly from the cloned repo.
|
||||
|
||||
```
|
||||
$ mkdir cluster
|
||||
$ cd cluster
|
||||
```
|
||||
|
||||
Then copy the init.sh and variables.json from examples/cc_lustre to your working directory.
|
||||
|
||||
```
|
||||
$ cp $azhpc_dir/examples/cc_lustre/init.sh .
|
||||
$ cp $azhpc_dir/examples/cc_lustre/variables.json .
|
||||
```
|
||||
|
||||
Edit variables.json to match your environment. Leave projectstore empty, as it will be filled in with a random value by the init script. An existing key vault should be referenced, as it won't be created for you.
|
||||
|
||||
```json
|
||||
{
|
||||
"resource_group": "my resource group",
|
||||
"location": "location",
|
||||
"key_vault": "my key vault",
|
||||
"projectstore": ""
|
||||
}
|
||||
```
|
||||
|
||||
Run the init.sh script, which copies all the building block config files into the working directory and initializes their variables from the variables.json updated above.
|
||||
|
||||
```
|
||||
$ ./init.sh
|
||||
```
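As an optional sanity check, you can query one of the merged variables from a generated config file (this assumes the `azhpc` CLI accepts the same `-c` config-file flag as the `azhpc-*` commands used below):

```
$ azhpc get variables.resource_group -c pbscycle.json
```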
|
||||
|
||||
## Step 3 - Build the system
|
||||
|
||||
```
|
||||
$ azhpc-build -c vnet.json
|
||||
$ azhpc-build --no-vnet -c jumpbox.json
|
||||
$ azhpc-build --no-vnet -c cycle-prereqs-managed-identity.json
|
||||
$ azhpc-build --no-vnet -c cycle-install-server-managed-identity.json
|
||||
```
|
||||
|
||||
## Step 4 - Deploy the Cycle CLI
|
||||
Deploy the Cycle CLI locally and on the jumpbox
|
||||
|
||||
```
|
||||
$ azhpc-build --no-vnet -c cycle-cli-local.json
|
||||
$ azhpc-build --no-vnet -c cycle-cli-jumpbox.json
|
||||
```
|
||||
|
||||
## Step 5 - Deploy the Lustre cluster
|
||||
```
|
||||
$ azhpc-build --no-vnet -c lustre-cluster.json
|
||||
```
|
||||
|
||||
## Step 6 - Create the PBS cluster in CycleCloud
|
||||
|
||||
```
|
||||
$ azhpc build -c pbscycle.json --no-vnet
|
||||
```
|
||||
|
||||
## Step 7 - Connect to the CycleServer UI
|
||||
Retrieve the CycleServer DNS name by connecting with azhpc
|
||||
|
||||
```
|
||||
$ azhpc-connect -c cycle-install-server-managed-identity.json cycleserver
|
||||
[2020-06-10 08:28:04] logging directly into cycleserver559036.westeurope.cloudapp.azure.com
|
||||
[hpcadmin@cycleserver ~]$ exit
|
||||
```
|
||||
|
||||
Retrieve the Cycle admin password from the logs
|
||||
|
||||
```
|
||||
$ grep password azhpc_install_cycle-install-server-managed-identity/install/*.log
|
||||
```
|
||||
|
||||
Connect to the Cycle UI with the hpcadmin user and the password retrieved above. Check that a pbscycle cluster is ready and start it.
|
|
@ -0,0 +1,88 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": {
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"hpc_vm_type": "Standard_HB60rs",
|
||||
"htc_vm_type": "Standard_D16s_v3",
|
||||
"location": "<NOT-SET>",
|
||||
"resource_group": "<NOT-SET>",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_name": "hpcvnet",
|
||||
"vnet_resource_group": "variables.resource_group"
|
||||
},
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "variables.vnet_name"
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"slurmcycle": {
|
||||
"template": "slurm_template_2.1.0",
|
||||
"parameters": {
|
||||
"MaxHPCExecuteCoreCount": 240,
|
||||
"MaxHTCExecuteCoreCount": 128,
|
||||
"HTCUseLowPrio" : false,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"HTCMachineType": "variables.htc_vm_type",
|
||||
"HPCMachineType": "variables.hpc_vm_type",
|
||||
"MasterImageName": "variables.htc_image",
|
||||
"HTCImageName": "variables.htc_image",
|
||||
"HPCImageName": "variables.hpc_image",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "{{variables.location}}",
|
||||
"configuration_slurm_version" : "19.05.5-1",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HPCClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"HTCClusterInitSpecs": {
|
||||
"azurehpc:lustre:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:lustre:1.0.0",
|
||||
"Spec": "lustre",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:lustre:1.0.0": [
|
||||
{
|
||||
"script": "lfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "lfsclient.sh",
|
||||
"args": [
|
||||
"headnode"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {},
|
||||
"install": []
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
{
|
||||
"resource_group": "my resource group",
|
||||
"location": "my location",
|
||||
"key_vault": "my key vault",
|
||||
"projectstore": ""
|
||||
}
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
{
|
||||
"location": "variables.location",
|
||||
"resource_group": "variables.resource_group",
|
||||
"install_from": "jumpbox",
|
||||
"admin_user": "hpcadmin",
|
||||
"variables": "@variables.json",
|
||||
"vnet": {
|
||||
"resource_group": "variables.vnet_resource_group",
|
||||
"name": "hpcvnet",
|
||||
"address_prefix": "10.2.0.0/20",
|
||||
"subnets": {
|
||||
"admin": "10.2.1.0/24",
|
||||
"viz": "10.2.2.0/24",
|
||||
"compute": "10.2.4.0/22"
|
||||
}
|
||||
},
|
||||
"cyclecloud": {
|
||||
"clusters": {
|
||||
"pbscycle": {
|
||||
"template": "pbspro_template_1.3.7",
|
||||
"parameters": {
|
||||
"MaxExecuteCoreCount": 1000,
|
||||
"MasterMachineType": "Standard_D8s_v3",
|
||||
"Credentials": "azure",
|
||||
"Autoscale": true,
|
||||
"SubnetId": "{{variables.resource_group}}/hpcvnet/compute",
|
||||
"UseLowPrio": false,
|
||||
"Region": "westeurope",
|
||||
"MasterClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
},
|
||||
"ExecuteMachineType": "variables.vm_type",
|
||||
"ImageName": "variables.hpc_image",
|
||||
"ExecuteClusterInitSpecs": {
|
||||
"azurehpc:beegfs:1.0.0": {
|
||||
"Order": 10000,
|
||||
"Name": "azurehpc:beegfs:1.0.0",
|
||||
"Spec": "beegfs",
|
||||
"Project": "azurehpc",
|
||||
"Version": "1.0.0",
|
||||
"Locker": "azure-storage"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"projects": {
|
||||
"azurehpc:beegfs:1.0.0": [
|
||||
{
|
||||
"script": "beegfspkgs.sh"
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [ "beegfsm" ]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"resources": {
|
||||
"jumpbox": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D4s_v3",
|
||||
"accelerated_networking": true,
|
||||
"public_ip": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "compute",
|
||||
"tags": [ ]
|
||||
},
|
||||
"beegfsm": {
|
||||
"type": "vm",
|
||||
"vm_type": "Standard_D4s_v3",
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "compute",
|
||||
"tags": [
|
||||
"beegfspkgs",
|
||||
"beegfsm",
|
||||
"disable-selinux",
|
||||
"beegfsc"
|
||||
]
|
||||
},
|
||||
"beegfssm": {
|
||||
"type": "vmss",
|
||||
"vm_type": "Standard_L8s_v2",
|
||||
"instances": 2,
|
||||
"accelerated_networking": true,
|
||||
"image": "variables.image",
|
||||
"subnet": "compute",
|
||||
"tags": [
|
||||
"beegfspkgs",
|
||||
"beegfssd",
|
||||
"beegfsmd",
|
||||
"disable-selinux"
|
||||
]
|
||||
}
|
||||
},
|
||||
"install": [
|
||||
{
|
||||
"script": "disable-selinux.sh",
|
||||
"tag": "disable-selinux",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfspkgs.sh",
|
||||
"tag": "beegfspkgs",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsm.sh",
|
||||
"args": [
|
||||
"/data/beegfs/mgmt"
|
||||
],
|
||||
"tag": "beegfsm",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfssd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"variables.beegfs_pools_restart",
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfssd",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsmd.sh",
|
||||
"args": [
|
||||
"variables.beegfs_disk_type",
|
||||
"variables.beegfs_node_type",
|
||||
"variables.beegfs_pools",
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfsmd",
|
||||
"sudo": true
|
||||
},
|
||||
{
|
||||
"script": "beegfsc.sh",
|
||||
"args": [
|
||||
"$(<hostlists/tags/beegfsm)"
|
||||
],
|
||||
"tag": "beegfsc",
|
||||
"sudo": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
## Tests
|
||||
|
||||
These tests should all run even if launched from a different working directory (opened files are relative to the config file that is passed).
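For example, a run from a different directory might look like this (a sketch; the config file path is a placeholder and the `-c` flag is assumed to behave as in the other `azhpc` commands):

```
$ cd /tmp
$ azhpc get -c <path-to>/pbscycle.json variables.hpc_image
```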
|
||||
|
||||
# simple test to read a variable
|
||||
|
||||
azhpc get variables.hpc_image
|
||||
|
||||
# this expects the variable substitution in the string
|
||||
|
||||
azhpc get cyclecloud.clusters.pbscycle.parameters.SubnetId
|
||||
|
||||
# this should show the variables section in the output
|
||||
|
||||
azhpc preprocess
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"image": "OpenLogic:CentOS:7.7:latest",
|
||||
"hpc_image": "OpenLogic:CentOS-HPC:7.7:latest",
|
||||
"location": "westeurope",
|
||||
"resource_group": "paul-test-cc-beegfs",
|
||||
"vm_type": "Standard_HB60rs",
|
||||
"vnet_resource_group": "variables.resource_group",
|
||||
"beegfs_disk_type": "nvme",
|
||||
"beegfs_node_type": "both",
|
||||
"beegfs_pools": "false",
|
||||
"beegfs_pools_restart": "false",
|
||||
"foo": "<NOT-SET>"
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash
|
||||
config_list="$1"
|
||||
AZHPC_CONFIG=$2
|
||||
AZHPC_VARIABLES=$3
|
||||
|
||||
if [ ! -e $AZHPC_CONFIG ]; then
|
||||
echo "destination config file '$AZHPC_CONFIG' is missing"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -e $AZHPC_VARIABLES ]; then
|
||||
echo "input variables file '$AZHPC_VARIABLES' is missing"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
function copy_and_merge_config()
|
||||
{
|
||||
local config=$1
|
||||
cp $config .
|
||||
config_file=${config##*/}
|
||||
|
||||
# Merge config files
|
||||
cp $AZHPC_CONFIG temp.json
|
||||
jq -s '.[0] * .[1]' temp.json $config_file > $AZHPC_CONFIG
|
||||
}
|
||||
|
||||
for config in $config_list; do
|
||||
echo "initializing config for $config"
|
||||
copy_and_merge_config $config
|
||||
done
|
||||
|
||||
# Concatenate install array into a single one
|
||||
install_list=$(jq -s '[.[].install[]]' $config_list)
|
||||
|
||||
# Replace the install array into the final config file
|
||||
jq '.install=$items' --argjson items "$install_list" $AZHPC_CONFIG > temp.json
|
||||
cp temp.json $AZHPC_CONFIG
|
||||
|
||||
# Merge variables file into config file
|
||||
cp $AZHPC_CONFIG temp.json
|
||||
jq '.variables+=$variables' --argjson variables "$(jq '.variables' $AZHPC_VARIABLES)" temp.json > $AZHPC_CONFIG
|
||||
|
||||
rm temp.json
|
|
@ -305,6 +305,38 @@ class ArmTemplate:
|
|||
]
|
||||
self.resources.append(netapp_volume)
|
||||
|
||||
def _add_storageaccount(self, cfg, name):
|
||||
loc = cfg["location"]
|
||||
|
||||
res = {
|
||||
"type": "Microsoft.Storage/storageAccounts",
|
||||
"apiVersion": "2019-06-01",
|
||||
"name": name,
|
||||
"location": loc,
|
||||
"sku": {
|
||||
"name": "Standard_LRS"
|
||||
},
|
||||
"kind": "StorageV2",
|
||||
"properties": {
|
||||
"accessTier": "Hot"
|
||||
},
|
||||
"resources": []
|
||||
}
|
||||
|
||||
for container in cfg["storage"][name].get("containers", []):
|
||||
res["resources"].append(
|
||||
{
|
||||
"type": "blobServices/containers",
|
||||
"apiVersion": "2019-06-01",
|
||||
"name": f"default/{container}",
|
||||
"dependsOn": [
|
||||
name
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
self.resources.append(res)
|
||||
|
||||
def _add_proximity_group(self, cfg):
|
||||
ppg = cfg.get("proximity_placement_group_name", None)
|
||||
if ppg:
|
||||
|
@ -816,6 +848,8 @@ class ArmTemplate:
|
|||
stype = cfg["storage"][s]["type"]
|
||||
if stype == "anf":
|
||||
self._add_netapp(cfg, s, deploy_network)
|
||||
elif stype == "storageaccount":
|
||||
self._add_storageaccount(cfg, s)
|
||||
else:
|
||||
log.error("unrecognised storage type ({}) for {}".format(stype, s))
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
@ -9,11 +10,15 @@ log = azlog.getLogger(__name__)
|
|||
|
||||
class ConfigFile:
|
||||
def __init__(self):
|
||||
self.file_location = '.'
|
||||
self.data = {}
|
||||
self.regex = re.compile(r'({{([^{}]*)}})')
|
||||
|
||||
def open(self, fname):
|
||||
log.debug("opening "+fname)
|
||||
self.file_location = os.path.dirname(fname)
|
||||
if self.file_location == "":
|
||||
self.file_location = "."
|
||||
with open(fname) as f:
|
||||
self.data = json.load(f)
|
||||
|
||||
|
@ -29,41 +34,35 @@ class ConfigFile:
|
|||
dest = azutil.get_fqdn(self.read_value("resource_group"), f"{install_from}_pip")
|
||||
log.debug(f"install_from destination : {dest}")
|
||||
return dest
|
||||
|
||||
def get_unset_vars(self):
|
||||
return [
|
||||
x
|
||||
for x in self.data.get("variables", {}).keys()
|
||||
if self.data["variables"][x] == "<NOT-SET>"
|
||||
]
|
||||
|
||||
def replace_vars(self, vdict):
|
||||
if "variables" in self.data:
|
||||
for v in vdict.keys():
|
||||
if v in self.data["variables"]:
|
||||
self.data["variables"][v] = vdict[v]
|
||||
|
||||
def __evaluate_dict(self, x):
|
||||
|
||||
def __evaluate_dict(self, x, extended):
|
||||
ret = {}
|
||||
for k in x.keys():
|
||||
ret[k] = self.__evaluate(x[k])
|
||||
ret[k] = self.__evaluate(x[k], extended)
|
||||
return ret
|
||||
|
||||
def __evaluate_list(self, x):
|
||||
return [ self.__evaluate(v) for v in x ]
|
||||
def __evaluate_list(self, x, extended):
|
||||
return [ self.__evaluate(v, extended) for v in x ]
|
||||
|
||||
def __evaluate(self, input):
|
||||
def __evaluate(self, input, extended=True):
|
||||
if type(input) == dict:
|
||||
return self.__evaluate_dict(input)
|
||||
return self.__evaluate_dict(input, extended)
|
||||
elif type(input) == list:
|
||||
return self.__evaluate_list(input)
|
||||
return self.__evaluate_list(input, extended)
|
||||
elif type(input) == str:
|
||||
return self.__process_value(input)
|
||||
fname = self.file_location + "/" + input[1:]
|
||||
if input.startswith("@") and os.path.isfile(fname):
|
||||
log.debug(f"loading include {fname}")
|
||||
with open(fname) as f:
|
||||
input = json.load(f)
|
||||
return self.__evaluate_dict(input, extended)
|
||||
else:
|
||||
return self.__process_value(input, extended)
|
||||
else:
|
||||
return input
|
||||
|
||||
def preprocess(self):
|
||||
res = self.__evaluate(self.data)
|
||||
def preprocess(self, extended=True):
|
||||
res = self.__evaluate(self.data, extended)
|
||||
return res
|
||||
|
||||
def read_keys(self, v):
|
||||
|
@ -90,6 +89,14 @@ class ConfigFile:
|
|||
try:
|
||||
it = self.data
|
||||
for x in v.split('.'):
|
||||
if type(it) is str:
|
||||
fname = self.file_location + "/" + it[1:]
|
||||
if it.startswith("@") and os.path.isfile(fname):
|
||||
log.debug(f"loading include {fname}")
|
||||
with open(fname) as f:
|
||||
it = json.load(f)
|
||||
else:
|
||||
log.error("invalid path in config file ({v})")
|
||||
it = it[x]
|
||||
|
||||
if type(it) is str:
|
||||
|
@ -104,13 +111,13 @@ class ConfigFile:
|
|||
|
||||
return res
|
||||
|
||||
def __process_value(self, v):
|
||||
log.debug("process_value (enter): "+str(v))
|
||||
def __process_value(self, v, extended=True):
|
||||
log.debug(f"process_value (enter): {v} [extended={extended}]")
|
||||
|
||||
def repl(match):
|
||||
return str(self.__process_value(match.group()[2:-2]))
|
||||
return str(self.__process_value(match.group()[2:-2], extended))
|
||||
|
||||
v = self.regex.sub(lambda m: str(self.__process_value(m.group()[2:-2])), v)
|
||||
v = self.regex.sub(lambda m: str(self.__process_value(m.group()[2:-2], extended)), v)
|
||||
|
||||
parts = v.split('.')
|
||||
prefix = parts[0]
|
||||
|
@ -119,7 +126,7 @@ class ConfigFile:
|
|||
res = self.read_value(v)
|
||||
elif prefix == "secret":
|
||||
res = azutil.get_keyvault_secret(parts[1], parts[2])
|
||||
elif prefix == "sasurl":
|
||||
elif extended and prefix == "sasurl":
|
||||
log.debug(parts)
|
||||
url = azutil.get_storage_url(parts[1])
|
||||
x = parts[-1].split(",")
|
||||
|
@ -133,21 +140,21 @@ class ConfigFile:
|
|||
log.debug(parts)
|
||||
path = ".".join(parts[2:])
|
||||
res = f"{url}{path}?{saskey}"
|
||||
elif prefix == "fqdn":
|
||||
elif extended and prefix == "fqdn":
|
||||
res = azutil.get_fqdn(self.read_value("resource_group"), parts[1]+"_pip")
|
||||
elif prefix == "sakey":
|
||||
elif extended and prefix == "sakey":
|
||||
res = azutil.get_storage_key(parts[1])
|
||||
elif prefix == "saskey":
|
||||
elif extended and prefix == "saskey":
|
||||
x = parts[2].split(",")
|
||||
if len(x) == 1:
|
||||
x.append("r")
|
||||
container = x[0].split('/')[0]
|
||||
res = azutil.get_storage_saskey(parts[1], container, x[1])
|
||||
elif prefix == "laworkspace":
|
||||
elif extended and prefix == "laworkspace":
|
||||
res = azutil.get_log_analytics_workspace(parts[1], parts[2])
|
||||
elif prefix == "lakey":
|
||||
elif extended and prefix == "lakey":
|
||||
res = azutil.get_log_analytics_key(parts[1], parts[2])
|
||||
elif prefix == "acrkey":
|
||||
elif extended and prefix == "acrkey":
|
||||
res = azutil.get_acr_key(parts[1])
|
||||
else:
|
||||
res = v
|
||||
|
|
|
@ -34,9 +34,49 @@ def do_get(args):
|
|||
|
||||
def __add_unset_vars(vset, config_file):
|
||||
log.debug(f"looking for vars in {config_file}")
|
||||
config = azconfig.ConfigFile()
|
||||
config.open(config_file)
|
||||
vset.update(config.get_unset_vars())
|
||||
file_loc = os.path.dirname(config_file)
|
||||
with open(config_file) as f:
|
||||
data = json.load(f)
|
||||
vars = data.get("variables", {})
|
||||
if type(vars) is str and vars.startswith("@"):
|
||||
fname = vars[1:]
|
||||
if file_loc == "":
|
||||
file_loc = "."
|
||||
log.debug(f"variables are redirected to {file_loc}/{fname}")
|
||||
with open(f"{file_loc}/{fname}") as f:
|
||||
vars = json.load(f)
|
||||
log.debug(vars)
|
||||
vset.update([
|
||||
x
|
||||
for x in vars.keys()
|
||||
if vars[x] == "<NOT-SET>"
|
||||
])
|
||||
|
||||
def __replace_vars(vset, config_file):
|
||||
log.debug(f"replacing vars in {config_file}")
|
||||
floc = os.path.dirname(config_file)
|
||||
fname = config_file
|
||||
with open(fname) as f:
|
||||
alldata = json.load(f)
|
||||
varsobj = alldata.get("variables", {})
|
||||
|
||||
if type(varsobj) is str and varsobj.startswith("@"):
|
||||
if floc != "":
|
||||
floc = floc + "/"
|
||||
fname = floc + varsobj[1:]
|
||||
log.debug(f"variables are redirected to {fname}")
|
||||
with open(f"{fname}") as f:
|
||||
alldata = json.load(f)
|
||||
varsobj = alldata
|
||||
|
||||
log.debug("replacing variables")
|
||||
for v in vset.keys():
|
||||
if v in varsobj:
|
||||
varsobj[v] = vset[v]
|
||||
|
||||
log.debug("saving file ({fname})")
|
||||
with open(fname, "w") as f:
|
||||
json.dump(alldata, f, indent=4)
|
||||
|
||||
def do_init(args):
|
||||
if not os.path.exists(args.config_file):
|
||||
|
@ -98,10 +138,8 @@ def do_init(args):
|
|||
for root, dirs, files in os.walk(args.dir):
|
||||
for name in files:
|
||||
if os.path.splitext(name)[1] == ".json":
|
||||
config = azconfig.ConfigFile()
|
||||
config.open(os.path.join(root, name))
|
||||
config.replace_vars(vset)
|
||||
config.save(os.path.join(root, name))
|
||||
fname = os.path.join(root, name)
|
||||
__replace_vars(vset, os.path.join(root, name))
|
||||
|
||||
def do_scp(args):
|
||||
log.debug("reading config file ({})".format(args.config_file))
|
||||
|
@ -552,7 +590,9 @@ def do_build(args):
|
|||
|
||||
c = azconfig.ConfigFile()
|
||||
c.open(args.config_file)
|
||||
config = c.preprocess()
|
||||
log.debug("About to preprocess")
|
||||
config = c.preprocess(extended=False)
|
||||
log.debug("Preprocessed")
|
||||
|
||||
adminuser = config["admin_user"]
|
||||
private_key_file = adminuser+"_id_rsa"
|
||||
|
@ -594,6 +634,9 @@ def do_build(args):
|
|||
|
||||
_wait_for_deployment(config["resource_group"], deployname)
|
||||
|
||||
log.info("re-evaluating the config")
|
||||
config = c.preprocess()
|
||||
|
||||
log.info("building host lists")
|
||||
azinstall.generate_hostlists(config, tmpdir)
|
||||
log.info("building install scripts")
|
||||
|
@ -604,6 +647,15 @@ def do_build(args):
|
|||
log.debug(f"running script from : {fqdn}")
|
||||
azinstall.run(config, tmpdir, adminuser, private_key_file, public_key_file, fqdn)
|
||||
|
||||
if "cyclecloud" in config:
|
||||
if "projects" in config["cyclecloud"]:
|
||||
log.info("creating cyclecloud projects")
|
||||
azinstall.generate_cc_projects(config, f"{tmpdir}/projects")
|
||||
|
||||
if "clusters" in config["cyclecloud"]:
|
||||
log.info("creating cyclecloud clusters")
|
||||
azinstall.generate_cc_clusters(config, f"{tmpdir}/clusters")
|
||||
|
||||
def do_destroy(args):
|
||||
log.info("reading config file ({})".format(args.config_file))
|
||||
config = azconfig.ConfigFile()
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
|
@ -12,6 +13,9 @@ log = azlog.getLogger(__name__)
|
|||
|
||||
pssh_threads = 50
|
||||
|
||||
def _make_subprocess_error_string(res):
|
||||
return "\n args={}\n return code={}\n stdout={}\n stderr={}".format(res.args, res.returncode, res.stdout.decode("utf-8"), res.stderr.decode("utf-8"))
|
||||
|
||||
def create_jumpbox_setup_script(tmpdir, sshprivkey, sshpubkey):
|
||||
scriptfile = f"{tmpdir}/install/00_install_node_setup.sh"
|
||||
logfile = "install/00_install_node_setup.log"
|
||||
|
@ -24,6 +28,11 @@ cd "$( dirname "${{BASH_SOURCE[0]}}" )/.."
|
|||
|
||||
tag=linux
|
||||
|
||||
if [ ! -f "hostlists/$tag" ]; then
|
||||
echo "no hostlist ($tag), exiting"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# wait for DNS to update for all hostnames
|
||||
for h in $(<hostlists/$tag); do
|
||||
until host $h >/dev/null 2>&1; do
|
||||
|
@ -224,6 +233,27 @@ def __config_has_netapp(cfg):
|
|||
return True
|
||||
return False
|
||||
|
||||
def __copy_script(name, dest):
|
||||
# this looks for the script locally first, else in $azhpc_dir/scripts
|
||||
if os.path.exists(f"scripts/{name}"):
|
||||
if os.path.isdir(f"scripts/{name}"):
|
||||
log.debug(f"using dir from this project ({name})")
|
||||
shutil.copytree(f"scripts/{name}", f"{dest}/{name}")
|
||||
else:
|
||||
log.debug(f"using script from this project ({name})")
|
||||
shutil.copy(f"scripts/{name}", dest)
|
||||
elif os.path.exists(f"{os.getenv('azhpc_dir')}/scripts/{name}"):
|
||||
if os.path.isdir(f"{os.getenv('azhpc_dir')}/scripts/{name}"):
|
||||
log.debug(f"using azhpc dir ({name})")
|
||||
shutil.copytree(f"{os.getenv('azhpc_dir')}/scripts/{name}", f"{dest}/{name}")
|
||||
else:
|
||||
log.debug(f"using azhpc script ({name})")
|
||||
shutil.copy(f"{os.getenv('azhpc_dir')}/scripts/{name}", dest)
|
||||
else:
|
||||
log.error(f"cannot find script/dir ({name})")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def generate_install(cfg, tmpdir, adminuser, sshprivkey, sshpubkey):
|
||||
jb = cfg.get("install_from", None)
|
||||
os.makedirs(tmpdir+"/install")
|
||||
|
@ -249,27 +279,108 @@ def generate_install(cfg, tmpdir, adminuser, sshprivkey, sshpubkey):
|
|||
sys.exit(1)
|
||||
|
||||
for script in [ step["script"] ] + step.get("deps", []):
|
||||
if os.path.exists(f"scripts/{script}"):
|
||||
if os.path.isdir(f"scripts/{script}"):
|
||||
log.debug(f"using dir from this project ({script})")
|
||||
shutil.copytree(f"scripts/{script}", f"{tmpdir}/scripts/{script}")
|
||||
else:
|
||||
log.debug(f"using script from this project ({script})")
|
||||
shutil.copy(f"scripts/{script}", tmpdir+"/scripts")
|
||||
elif os.path.exists(f"{os.getenv('azhpc_dir')}/scripts/{script}"):
|
||||
if os.path.isdir(f"{os.getenv('azhpc_dir')}/scripts/{script}"):
|
||||
log.debug(f"using azhpc dir ({script})")
|
||||
shutil.copytree(f"{os.getenv('azhpc_dir')}/scripts/{script}", f"{tmpdir}/scripts/{script}")
|
||||
else:
|
||||
log.debug(f"using azhpc script ({script})")
|
||||
shutil.copy(f"{os.getenv('azhpc_dir')}/scripts/{script}", tmpdir+"/scripts")
|
||||
else:
|
||||
log.error(f"cannot find script/dir ({script})")
|
||||
sys.exit(1)
|
||||
__copy_script(script, f"{tmpdir}/scripts")
|
||||
|
||||
def _make_subprocess_error_string(res):
|
||||
return "\n args={}\n return code={}\n stdout={}\n stderr={}".format(res.args, res.returncode, res.stdout.decode("utf-8"), res.stderr.decode("utf-8"))
|
||||
def __cyclecloud_upload_project(project_dir):
|
||||
cyclecloud_exe = shutil.which("cyclecloud")
|
||||
if cyclecloud_exe is None:
|
||||
cyclecloud_exe = os.path.join(os.environ["HOME"], "bin", "cyclecloud")
|
||||
if not os.path.isfile(cyclecloud_exe):
|
||||
log.error("cyclecloud cli not found")
|
||||
sys.exit(1)
|
||||
|
||||
cmd = [ cyclecloud_exe, "project", "default_locker", "azure-storage" ]
|
||||
res = subprocess.run(cmd, cwd=project_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if res.returncode != 0:
|
||||
log.error("invalid returncode"+_make_subprocess_error_string(res))
|
||||
sys.exit(1)
|
||||
|
||||
cmd = [ cyclecloud_exe, "project", "upload" ]
|
||||
res = subprocess.run(cmd, cwd=project_dir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if res.returncode != 0:
|
||||
log.error("invalid returncode"+_make_subprocess_error_string(res))
|
||||
sys.exit(1)
|
||||
|
||||
def generate_cc_projects(config, tmpdir):
|
||||
# create projects in filesystem with the following structure:
|
||||
#
|
||||
# <project-name>_<project-version>
|
||||
# ├── project.ini
|
||||
# ├── specs
|
||||
# │ ├── <spec-name>
|
||||
# │ └── cluster-init
|
||||
# │ ├── scripts
|
||||
# │ ├── files
|
||||
#
|
||||
# Azurehpc scripts will be put in the files directory and the scripts
|
||||
# will be generated to call azurehpc scripts with the correct args.
|
||||
for p in config.get("cyclecloud",{}).get("projects", {}):
|
||||
pl = p.split(":")
|
||||
if len(pl) != 3:
|
||||
log.error(f"cannot parse cyclecloud project name - {p}. Format should be PROJECT:SPEC:VERSION.")
|
||||
sys.exit(1)
|
||||
project, spec, version = pl
|
||||
project_dir = f"{tmpdir}/{project}_{version}"
|
||||
|
||||
if not os.path.exists(project_dir):
|
||||
# create directory and project.ini file
|
||||
os.makedirs(project_dir)
|
||||
project_ini = f"""[project]
|
||||
version = {version}
|
||||
type = application
|
||||
name = {project}
|
||||
"""
|
||||
with open(f"{project_dir}/project.ini", "w") as f:
|
||||
f.write(project_ini)
|
||||
|
||||
spec_dir = f"{project_dir}/specs/{spec}"
|
||||
scripts_dir = f"{spec_dir}/cluster-init/scripts"
|
||||
files_dir = f"{spec_dir}/cluster-init/files"
|
||||
os.makedirs(scripts_dir)
|
||||
os.makedirs(files_dir)
|
||||
|
||||
for idx, step in enumerate(config["cyclecloud"]["projects"][p]):
|
||||
script = step["script"]
|
||||
script_file = f"{scripts_dir}/{idx:02d}_{script}"
|
||||
|
||||
# copy script file and dependencies into files_dir
|
||||
for s in [ script ] + step.get("deps", []):
|
||||
__copy_script(s, files_dir)
|
||||
|
||||
# create cluster-init script
|
||||
args = " ".join([ f'"{arg}"' for arg in step.get("args", []) ])
|
||||
script_content = f"""#!/bin/bash
|
||||
chmod +x $CYCLECLOUD_SPEC_PATH/files/*.sh
|
||||
$CYCLECLOUD_SPEC_PATH/files/{script} {args}
|
||||
"""
|
||||
with open(script_file, "w") as f:
|
||||
os.chmod(script_file, 0o755)
|
||||
f.write(script_content)
|
||||
|
||||
log.info(f"uploading project ({project_dir})")
|
||||
__cyclecloud_upload_project(project_dir)
|
||||
|
||||
def __cyclecloud_create_cluster(template, name, paramfile):
|
||||
cmd = [
|
||||
"cyclecloud", "create_cluster", template, name,
|
||||
"-p", paramfile, "--force"
|
||||
]
|
||||
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if res.returncode != 0:
|
||||
log.error("invalid returncode"+_make_subprocess_error_string(res))
|
||||
sys.exit(1)
|
||||
|
||||
def generate_cc_clusters(config, tmpdir):
|
||||
os.makedirs(tmpdir)
|
||||
for cluster_name in config.get("cyclecloud",{}).get("clusters", {}):
|
||||
log.info(f"creating cluster {cluster_name}")
|
||||
cluster_template = config["cyclecloud"]["clusters"][cluster_name]["template"]
|
||||
cluster_params = config["cyclecloud"]["clusters"][cluster_name]["parameters"]
|
||||
cluster_json = f"{tmpdir}/{cluster_name}.json"
|
||||
with open(cluster_json, "w") as f:
|
||||
f.write(json.dumps(cluster_params, indent=4))
|
||||
__cyclecloud_create_cluster(cluster_template, cluster_name, cluster_json)
|
||||
|
||||
def __rsync(sshkey, src, dst):
|
||||
cmd = [
|
||||
"rsync", "-a", "-e",
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
|
||||
if ! rpm -q nfs-utils; then
|
||||
yum install -y nfs-utils
|
||||
fi
|
||||
|
||||
mountpath=$1
|
||||
mountpoint=$2
|
||||
|
||||
mkdir -p $mountpoint
|
||||
|
||||
echo "$mountpath $mountpoint nfs bg,rw,hard,noatime,nolock,rsize=65536,wsize=65536,vers=3,tcp,_netdev 0 0" >>/etc/fstab
|
||||
|
||||
mount $mountpoint
|
||||
|
||||
chmod 777 $mountpoint
|
|
@ -0,0 +1,101 @@
|
|||
#!/bin/bash
|
||||
# This utility script contains helper functions called in the azurehpc to CycleCloud integration
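# Usage: azhpc4cycle.sh <helper>
#   where <helper> selects one of the functions defined below:
#   enable_metada_access, disable_jetpack_converge, disable_fail2ban or fix_pbs_limits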
|
||||
helper=${1,,}
|
||||
JETPACK_HOME=/opt/cycle/jetpack
|
||||
|
||||
if [ ! -e $JETPACK_HOME/bin/jetpack ]; then
|
||||
echo "Not running in a CycleCloud environment exiting"
|
||||
else
|
||||
echo "Running in a CycleCloud environment"
|
||||
fi
|
||||
|
||||
enable_metada_access()
|
||||
{
|
||||
# Enable METADATA SERVICE access if blocked. This is the case with CycleCloud 7.x by default
|
||||
# Delete all rules regarding 169.254.169.254
|
||||
prevent_metadata_access=$($JETPACK_HOME/bin/jetpack config cyclecloud.node.prevent_metadata_access)
|
||||
prevent_metadata_access=${prevent_metadata_access,,}
|
||||
echo "cyclecloud.node.prevent_metadata_access=$prevent_metadata_access"
|
||||
if [ "$prevent_metadata_access" == "true" ]; then
|
||||
echo "Allow Metadata Service access"
|
||||
echo "Dumping IPTABLES"
|
||||
iptables -L
|
||||
rule=$(iptables -S | grep -E 169.254.169.254 | tail -n1)
|
||||
while [ -n "$rule" ]; do
|
||||
delete_rule=$(sed 's/-A/-D/g' <<< $(echo $rule))
|
||||
iptables $delete_rule
|
||||
rule=$(iptables -S | grep -E 169.254.169.254 | tail -n1)
|
||||
done
|
||||
echo "Dumping IPTABLES"
|
||||
iptables -L
|
||||
fi
|
||||
}
|
||||
|
||||
# Disabling jetpack converge can only be done through a cron job, as cluster-init scripts are executed before the crontab is updated with the converge entry
|
||||
# If converge is enabled, add a cron to run every minute until the converge entry in the crontab is removed
|
||||
disable_jetpack_converge()
|
||||
{
|
||||
# Remove Jetpack converge from the crontab
|
||||
maintenance_converge=$($JETPACK_HOME/bin/jetpack config cyclecloud.maintenance_converge.enabled)
|
||||
maintenance_converge=${maintenance_converge,,}
|
||||
echo "cyclecloud.maintenance_converge.enabled=$maintenance_converge"
|
||||
if [ "$maintenance_converge" == "true" ]; then
|
||||
# Check if converge is in the crontab and if so remove it, if not add a cron entry to check it every minute
|
||||
grep_for="jetpack converge --mode=maintenance"
|
||||
converge=$(crontab -l | grep "$grep_for")
|
||||
if [ -n "$converge" ]; then
|
||||
echo "Dump crontab"
|
||||
crontab -l
|
||||
echo "Remove Jetpack converge from crontab"
|
||||
crontab -l | grep -v "$grep_for" | crontab -
|
||||
echo "Remove our crontab entry"
|
||||
crontab -l | grep -v "disable_jetpack_converge" | crontab -
|
||||
else
|
||||
# Add an entry in cron only if no one exists
|
||||
disable_jetpack_converge=$(crontab -l | grep disable_jetpack_converge)
|
||||
if [ -z "$disable_jetpack_converge" ]; then
|
||||
echo "*/1 * * * * $0 disable_jetpack_converge >> $JETPACK_HOME/logs/azhpc4cycle.log 2>&1" > crontab-fragment.txt
|
||||
crontab -l | cat - crontab-fragment.txt >crontab.txt
|
||||
crontab crontab.txt
|
||||
fi
|
||||
fi
|
||||
echo "Dump crontab"
|
||||
crontab -l
|
||||
fi
|
||||
}
|
||||
|
||||
disable_fail2ban()
|
||||
{
|
||||
# Disable fail2ban
|
||||
echo "Disabling fail2ban"
|
||||
systemctl stop fail2ban
|
||||
systemctl disable fail2ban
|
||||
}
|
||||
|
||||
fix_pbs_limits()
|
||||
{
|
||||
# Fix PBS limits issue
|
||||
if [ -e /opt/pbs/lib/init.d/limits.pbs_mom ]; then
|
||||
echo "Fixing limits.pbs_mom"
|
||||
sed -i "s/^if /#if /g" /opt/pbs/lib/init.d/limits.pbs_mom
|
||||
sed -i "s/^fi/#fi /g" /opt/pbs/lib/init.d/limits.pbs_mom
|
||||
fi
|
||||
}
|
||||
|
||||
case $helper in
|
||||
enable_metada_access)
|
||||
enable_metada_access
|
||||
;;
|
||||
disable_jetpack_converge)
|
||||
disable_jetpack_converge
|
||||
;;
|
||||
disable_fail2ban)
|
||||
disable_fail2ban
|
||||
;;
|
||||
fix_pbs_limits)
|
||||
fix_pbs_limits
|
||||
;;
|
||||
*)
|
||||
echo "unknown function"
|
||||
;;
|
||||
esac
|
|
@ -0,0 +1,27 @@
|
|||
#!/bin/bash
|
||||
fqdn=$1
|
||||
admin_user=$2
|
||||
password=$3
|
||||
projectstore=$4
|
||||
|
||||
echo "Get cyclecloud_install.py"
|
||||
wget -q "https://raw.githubusercontent.com/dapolloxp/AzureCycleAKSDeployment/master/docker/cyclecloud7/scripts/cyclecloud_install.py"
|
||||
|
||||
key=$(cat /home/$admin_user/.ssh/id_rsa.pub)
|
||||
|
||||
echo "Setup cyclecloud_install.py for $fqdn"
|
||||
python cyclecloud_install.py \
|
||||
--useManagedIdentity \
|
||||
--username $admin_user \
|
||||
--hostname $fqdn \
|
||||
--acceptTerms \
|
||||
--publickey "$key" \
|
||||
--password ${password} \
|
||||
--storageAccount $projectstore
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error : Error installing Cycle Cloud"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "CycleCloud application server installation finished"
|
||||
echo "Navigate to https://$fqdn and login using $admin_user"
|
|
@ -15,7 +15,7 @@ az keyvault show -n $key_vault --output table 2>/dev/null
|
|||
if [ "$?" = "0" ]; then
|
||||
echo "keyvault $key_vault already exists"
|
||||
else
|
||||
az keyvault create --name $key_vault --resource-group $resource_group --output table
|
||||
az keyvault create --name $key_vault --resource-group $resource_group --enable-soft-delete false --output table
|
||||
fi
|
||||
|
||||
# If an SPN appId is provided then consider it's associated SPN secret is already stored in the KV
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
#!/bin/bash
|
||||
# Create all the prerequisites needed to deploy a Cycle Cloud server
|
||||
# - Key vault if it doesn't exist
|
||||
# - Create a Service Principal Name if it doesn't exist and store its secret in the Key Vault
|
||||
# - A storage account
|
||||
# - Generate a password and store it in the Keyvault
|
||||
resource_group=$1
|
||||
key_vault=$2
|
||||
projectstore=$3
|
||||
|
||||
# Create Key Vault to store secrets and keys
|
||||
az keyvault show -n $key_vault --output table 2>/dev/null
|
||||
if [ "$?" = "0" ]; then
|
||||
echo "keyvault $key_vault already exists"
|
||||
else
|
||||
az keyvault create --name $key_vault --resource-group $resource_group --enable-soft-delete false --output table
|
||||
fi
|
||||
|
||||
echo "Creating storage account $projectstore for projects"
|
||||
az storage account create \
|
||||
--name $projectstore \
|
||||
--sku Standard_LRS \
|
||||
--resource-group $resource_group \
|
||||
--kind StorageV2 \
|
||||
--output table
|
||||
|
||||
# If no password is stored, create a random one
|
||||
password=$(az keyvault secret show --name "CycleAdminPassword" --vault-name $key_vault -o json | jq -r '.value')
|
||||
if [ "$password" == "" ]; then
|
||||
echo "No secret CycleAdminPassword retrieved from Key Vault $key_vault"
|
||||
echo "Generate a password"
|
||||
# Prefix the password with a * so that the password prerequisites for Cycle are met (3 of: upper case + lower case + number + special)
|
||||
password="*$(date +%s | sha256sum | base64 | head -c 16 ; echo)"
|
||||
echo "Store password in Key Vault $key_vault secret CycleAdminPassword"
|
||||
az keyvault secret set --vault-name $key_vault --name "CycleAdminPassword" --value "$password" --output table
|
||||
fi
|
||||
|
|
@ -1,8 +1,23 @@
|
|||
#!/bin/bash
|
||||
# Script to be run on all compute nodes
|
||||
if ! rpm -q epel-release; then
|
||||
yum -y install epel-release
|
||||
fi
|
||||
|
||||
yum -y install epel-release
|
||||
yum -y install git jq htop
|
||||
|
||||
# change access to resource so that temp jobs can be written there
|
||||
chmod 777 /mnt/resource
|
||||
|
||||
# If running on Cycle
|
||||
# - enable METADATA access
|
||||
# - remove Jetpack convergence
|
||||
# - Disable Fail2Ban service
|
||||
# - Fix PBS limits
|
||||
if [ -e $CYCLECLOUD_HOME/bin/jetpack ]; then
|
||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
|
||||
$DIR/azhpc4cycle.sh enable_metada_access
|
||||
$DIR/azhpc4cycle.sh disable_jetpack_converge
|
||||
$DIR/azhpc4cycle.sh disable_fail2ban
|
||||
$DIR/azhpc4cycle.sh fix_pbs_limits
|
||||
fi
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
#!/bin/bash
|
||||
# Create a Key Vault if it doesn't exist
|
||||
resource_group=$1
|
||||
key_vault=$2
|
||||
|
||||
# Create Key Vault to store secrets and keys
|
||||
az keyvault show -n $key_vault --output table 2>/dev/null
|
||||
if [ "$?" = "0" ]; then
|
||||
echo "keyvault $key_vault already exists"
|
||||
else
|
||||
az keyvault create --name $key_vault --resource-group $resource_group --enable-soft-delete false --output table
|
||||
fi
|
|
@ -0,0 +1,17 @@
|
|||
#!/bin/bash
|
||||
# Create a new password and store it in the keyvault.
|
||||
# The Key Vault needs to exist already
|
||||
# If a password already exists in the Key Vault, don't create a new one
|
||||
key_vault=$1
|
||||
secret_name=$2
|
||||
|
||||
# If no password is stored, create a random one
|
||||
password=$(az keyvault secret show --name "$secret_name" --vault-name $key_vault -o json | jq -r '.value')
|
||||
if [ "$password" == "" ]; then
|
||||
echo "No secret $secret_name retrieved from Key Vault $key_vault"
|
||||
echo "Generate a password"
|
||||
# Prefix the password with a * so that the password prerequisites for Cycle are met (3 of: upper case + lower case + number + special)
|
||||
password="*$(date +%s | sha256sum | base64 | head -c 16 ; echo)"
|
||||
echo "Store password in Key Vault $key_vault secret $secret_name"
|
||||
az keyvault secret set --vault-name $key_vault --name "$secret_name" --value "$password" --output table
|
||||
fi
|
|
@ -8,7 +8,7 @@ cyclecloud_storage_key=$5
|
|||
|
||||
# Installing CycleCloud CLI
|
||||
echo "Getting CLI binaries..."
|
||||
wget --no-check-certificate https://$fqdn/download/tools/cyclecloud-cli.zip
|
||||
wget -q --no-check-certificate https://$fqdn/download/tools/cyclecloud-cli.zip
|
||||
|
||||
unzip -o cyclecloud-cli.zip
|
||||
pushd cyclecloud-cli-installer/
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
#!/bin/bash
|
||||
|
||||
ADMIN_DOMAIN=$1
|
||||
ADMIN_NAME=$2
|
||||
ADMIN_PASSWORD=$3
|
||||
AD_SERVER=$4
|
||||
|
||||
echo $1 $2 $3 $4
|
||||
|
||||
yum install sssd realmd oddjob oddjob-mkhomedir adcli samba-common samba-common-tools krb5-workstation openldap-clients policycoreutils-python -y
|
||||
|
||||
semodule -i ../files/mkhomedir.pp
|
||||
|
||||
systemctl restart dbus
|
||||
systemctl restart systemd-logind
|
||||
|
||||
NAMESERVER=`getent ahosts $AD_SERVER | awk '{print $1;exit}'`
|
||||
echo "supersede domain-name-servers ${NAMESERVER};" > /etc/dhcp/dhclient.conf
|
||||
echo "append domain-name-servers 168.63.129.16;" >> /etc/dhcp/dhclient.conf
|
||||
systemctl restart NetworkManager
|
||||
|
||||
sleep 10
|
||||
|
||||
echo $ADMIN_PASSWORD| realm join -U $ADMIN_NAME $ADMIN_DOMAIN --verbose
|
||||
|
||||
|
||||
sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config
|
||||
systemctl restart sshd
|
||||
|
||||
sed -i 's@override_homedir.*@override_homedir = /shared/home/%u@' /etc/sssd/sssd.conf
|
||||
sed -i 's@fallback_homedir.*@fallback_homedir = /shared/home/%u@' /etc/sssd/sssd.conf
|
||||
sed -i 's@use_fully_qualified_names.*@use_fully_qualified_names = False@' /etc/sssd/sssd.conf
|
||||
sed -i 's@ldap_id_mapping.*@ldap_id_mapping = True@' /etc/sssd/sssd.conf
|
||||
systemctl restart sssd
|
||||
|
||||
cat <<EOF >/etc/ssh/ssh_config
|
||||
Host *
|
||||
StrictHostKeyChecking no
|
||||
UserKnownHostsFile /dev/null
|
||||
EOF
|
Binary file not shown.