Added packaging of the cc_slurm_nhc tests and config files to the GitHub Actions release workflow

Anastasiia Evdokimova 2023-03-15 16:35:59 -07:00
Parent da344b26a3
Commit 5f9f88f481
18 changed files with 43 additions and 495 deletions

.github/workflows/release.yml (vendored)

@@ -14,15 +14,21 @@ jobs:
- name: Build project # This would actually build your project, using zip for an example artifact
run: |
cd ./hcheck/hcheck/
dotnet build -r linux-x64 --self-contained
dotnet build --use-current-runtime
- name: Publish
run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release -r linux-x64 --self-contained
run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release --use-current-runtime
- name: copy send_log file
run: cp ./hcheck/hcheck/src/send_log /home/runner/work/cyclecloud-nodehealth/cyclecloud-nodehealth/hcheck/hcheck/bin/Release/net6.0/linux-x64/
- name: Get the version
id: get_version
run:
echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/}
- name: download nhc scripts
run: |
curl https://codeload.github.com/Azure/azurehpc/tar.gz/master |
tar -xz --strip=7 azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init
cp ./files/*.conf /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-config/
cp ./files/*.nhc /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-tests/
- name: tar files
run: |
echo ${{ steps.get_version.outputs.version }}
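For reference, a minimal local sketch of what the "download nhc scripts" step above does. The scratch directory and the explicit -f - are illustrative additions; the URL and the --strip=7 depth come from the workflow itself.

#!/usr/bin/env bash
# Sketch only: reproduce the extraction locally to see what --strip=7 leaves behind.
# The tarball member paths begin with seven directory components
# (azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init),
# so stripping them places the cluster-init contents directly in the working directory.
set -euo pipefail
mkdir -p /tmp/nhc-extract && cd /tmp/nhc-extract
curl -sL https://codeload.github.com/Azure/azurehpc/tar.gz/master \
  | tar -xzf - --strip=7 azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init
find . -maxdepth 3 | sort   # inspect what was extracted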


@@ -0,0 +1,14 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
echo "failed"; exit 1;
fi
node_index=$(jetpack config cyclecloud.node.name | cut -d- -f5)
if [[ $(expr $node_index % 2) == 0 ]]; then
echo failed; exit 1;
fi


@@ -98,10 +98,12 @@ HCHECK_JSON=${HCHECK_FILES}${HCHECK_CONFIG}
#NHC_CONF_FILE_NEW=${CYCLECLOUD_SPEC_PATH}/files/$(jq -r '.nhc.config' ${HCHECK_JSON})
NHC_CONF_NAME=$(jq -r '.nhc.config' ${HCHECK_JSON})
NHC_TIMEOUT=$(jq -r '.nhc.timeout' ${HCHECK_JSON})
if [[ $NHC_CONF_NAME == null ]]
then
$NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
fi
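For context, a minimal sketch of the settings file these jq lookups assume. The .nhc.config and .nhc.timeout key names come from the lines above; the file path and the concrete values below are illustrative only.

#!/usr/bin/env bash
# Hypothetical healthchecks.json of the shape read above (values are made up).
cat > /tmp/healthchecks.json <<'EOF'
{
  "nhc": {
    "config": "Standard_ND96asr_v4.conf",
    "timeout": 300000
  }
}
EOF
NHC_CONF_NAME=$(jq -r '.nhc.config' /tmp/healthchecks.json)   # -> Standard_ND96asr_v4.conf
NHC_TIMEOUT=$(jq -r '.nhc.timeout' /tmp/healthchecks.json)    # -> 300000
# When "config" is absent, jq -r prints the literal string "null",
# which is what the == null fallback above tests for before defaulting
# to the VM size reported by jetpack.
echo "config=$NHC_CONF_NAME timeout=$NHC_TIMEOUT"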


@@ -0,0 +1,9 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
echo "failed"; exit 1;
fi


@@ -52,7 +52,7 @@
* || check_fs_mount_rw -t "devtmpfs" -s "devtmpfs" -f "/dev"
* || check_fs_mount_rw -t "devpts" -s "devpts" -f "/dev/pts"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/run"
* || check_fs_mount_rw -t "xfs" -s "/dev/sda2" -f "/"
* || check_fs_mount_rw -t "xfs" -s "/dev/sdb2" -f "/"
* || check_fs_mount_rw -t "securityfs" -s "securityfs" -f "/sys/kernel/security"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/dev/shm"
* || check_fs_mount_ro -t "tmpfs" -s "tmpfs" -f "/sys/fs/cgroup"


@@ -1,49 +0,0 @@
#!/bin/bash
# Expected bandwidth > 22GB/s
BANDWIDTHTEST_EXE_PATH=/usr/local/cuda/samples/1_Utilities/bandwidthTest/bandwidthTest
BANDWIDTHTEST=`basename $BANDWIDTHTEST_EXE_PATH`
#NUMA mapping for NDv4(A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )
function check_cuda_bw()
{
EXP_CUDA_BW=$1
for test in "--dtoh" "--htod"
do
for device in {0..7};
do
IFS=$'\n'
CUDA_BW=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $BANDWIDTHTEST_EXE_PATH --device=$device $test)
CUDA_BW_RC=$?
if [[ $CUDA_BW_RC != 0 ]]
then
log "$CUDA_BW"
die 1 "$FUNCNAME: $BANDWIDTHTEST retuned error code $CUDA_BW_RC "
return 1
fi
CUDA_BW_LINES=( $CUDA_BW )
for ((i=0; i<${#CUDA_BW_LINES[*]}; i++))
do
if [[ "${CUDA_BW_LINES[$i]//32000000}" != "${CUDA_BW_LINES[$i]}" ]]
then
IFS=$' \t\n'
LINE=( ${CUDA_BW_LINES[$i]} )
cuda_bandwidth=${LINE[1]}
dbg "gpu id=$device: numa domain=${GPU_NUMA[$device]}, Measured CUDA BW $cuda_bandwidth GB/s"
break
fi
done
if [[ $cuda_bandwidth < $EXP_CUDA_BW ]]
then
log "$CUDA_BW"
die 1 "$FUNCNAME: $BANDWIDTHTEST, gpu=$device, CUDA BW $test (expected > $EXP_CUDA_BW GB/s, but measured $cuda_bandwidth GB/s"
return 1
fi
done
done
IFS=$' \t\n'
return 0
}


@@ -1,47 +0,0 @@
#!/bin/bash
# Check if application GPU clock frequencies are set to their maximum values, if not will attempt to set them.
GPU_QUERY_CLOCKS="clocks.max.memory,clocks.applications.memory,clocks.max.graphics,clocks.applications.graphics"
function collect_clocks_data() {
gpu_freq_out=$(nvidia-smi --query-gpu=$GPU_QUERY_CLOCKS --format=csv,noheader,nounits)
gpu_freq_out_rc=$?
if [[ $gpu_freq_out_rc != 0 ]]; then
log "$gpu_freq_out"
die 1 "$FUNCNAME: nvidia-smi (get clock freqs) returned error code $gpu_freq_out_rc"
fi
IFS=$'\n'
gpu_freq_out_lines=( $gpu_freq_out )
IFS=$' \t\n'
}
function check_app_gpu_clocks() {
collect_clocks_data
for ((i=0; i<${#gpu_freq_out_lines[*]}; i++))
do
IFS=$', '
gpu_freq_out_line=( ${gpu_freq_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_freq_out_line[0]} -gt ${gpu_freq_out_line[1]} || ${gpu_freq_out_line[2]} -gt ${gpu_freq_out_line[3]} ]]; then
log "Warning: GPU Id $i: GPU memory freq (max,current)= (${gpu_freq_out_line[0]},${gpu_freq_out_line[1]}) MHz, GPU graphics freq (max,current) = (${gpu_freq_out_line[2]},${gpu_freq_out_line[3]}) MHz"
log "Attempting to set application GPU clock frequencies to maximum frequencies"
set_gpu_freq_out=$(nvidia-smi -i $i -ac ${gpu_freq_out_line[0]},${gpu_freq_out_line[2]})
set_gpu_freq_out_rc=$?
if [[ $set_gpu_freq_out_rc != 0 ]]; then
log "$set_gpu_freq_out"
die 1 "$FUNCNAME: nvidia-smi (set gpu max clock freqs) returned error code $set_gpu_freq_out_rc"
fi
log "On GPU Id $i: $set_gpu_freq_out"
return 0
else
dbg "GPU Id $i: max application GPU clocks are already set, GPU memory is ${gpu_freq_out_line[0]} MHz and GPU graphics is ${gpu_freq_out_line[2]} MHz"
return 0
fi
done
}


@@ -1,43 +0,0 @@
#!/bin/bash
GPU_THROTTLE_QUERY="clocks_throttle_reasons.active"
GPU_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN=0x0000000000000008
GPU_CLOCKS_THROTTLE_REASON_HW_THERMAL_SLOWDOWN=0x0000000000000040
GPU_CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCK_SETTINGS=0x0000000000000002
GPU_CLOCKS_THROTTLE_REASON_DISPLAY_SETTINGS=0x0000000000000100
GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE=0x0000000000000001
GPU_CLOCKS_THROTTLE_REASON_POWER_BRAKE_SLOWDOWN=0x0000000000000080
GPU_CLOCKS_THROTTLE_REASON_NONE=0x0000000000000000
GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP=0x0000000000000004
GPU_CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN=0x0000000000000020
GPU_CLOCKS_THROTTLE_REASON_SYNC_BOOST=0x0000000000000010
function collect_gpu_clock_throttle_data() {
gpu_clock_throttle_query_out=$(nvidia-smi --query-gpu=$GPU_THROTTLE_QUERY --format=csv,noheader,nounits)
gpu_clock_throttle_query_rc=$?
if [[ $gpu_clock_throttle_query_rc != 0 ]]; then
log "$gpu_clock_throttle_query_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu clock throttle data) returned error code $gpu_clock_throttle_query_rc"
fi
dbg "gpu_clock_throttle_query_out=$gpu_clock_throttle_query_out"
IFS=$'\n'
gpu_clock_throttle_out_lines=( $gpu_clock_throttle_query_out )
IFS=$' \t\n'
}
function check_gpu_clock_throttling() {
collect_gpu_clock_throttle_data
for ((i=0; i<${#gpu_clock_throttle_out_lines[*]}; i++))
do
IFS=$', '
gpu_clock_throttle_out_line=( ${gpu_clock_throttle_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_NONE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP ]]; then
log "Warning: GPU $i throttled, reason=${gpu_clock_throttle_out_line[0]}"
# Just log GPU throttling (but do not DRAIN node)
# die 1 "$FUNCNAME: GPU $i clock throttled, reason=${gpu_clock_throttle_out_line[0]}"
fi
done
}


@@ -1,68 +0,0 @@
#!/bin/bash
# Check for GPU ECC errors
GPU_REMAPPED_ROWS_QUERY="remapped_rows.pending,remapped_rows.failure"
GPU_QUERY="ecc.errors.uncorrected.volatile.sram,ecc.errors.uncorrected.aggregate.sram,ecc.errors.uncorrected.volatile.dram,ecc.errors.uncorrected.aggregate.dram"
function collect_ecc_data() {
gpu_query_out=$(nvidia-smi --query-gpu=$GPU_QUERY --format=csv,noheader)
gpu_query_out_rc=$?
if [[ $gpu_query_out_rc != 0 ]]
then
log "$gpu_query_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu uncorrected counts) returned error code $gpu_query_out_rc"
fi
gpu_remapped_rows_out=$(nvidia-smi --query-remapped-rows=$GPU_REMAPPED_ROWS_QUERY --format=csv,noheader)
gpu_remapped_rows_out_rc=$?
if [[ $gpu_remapped_rows_out_rc != 0 ]]
then
log "$gpu_remaped_rows_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu remapped rows) returned error code $gpu_freq_out_rc"
fi
IFS=$'\n'
gpu_query_out_lines=( $gpu_query_out )
gpu_remapped_rows_query_out_lines=( $gpu_remapped_rows_out )
IFS=$' \t\n'
}
function check_gpu_ecc() {
collect_ecc_data
if [[ ${#gpu_query_out_lines[*]} != ${#gpu_remapped_rows_query_out_lines[*]} ]]; then
die 1 "$FUNCNAME: nvidia-smi (Number GPU's not correct), (${#gpu_query_out_lines[*]},${#gpu_remapped_rows_query_out_lines[*]})"
fi
for ((i=0; i<${#gpu_remapped_rows_query_out_lines[*]}; i++))
do
IFS=$', '
gpu_remapped_rows_query_out_line=( ${gpu_remapped_rows_query_out_lines[$i]} )
gpu_query_out_line=( ${gpu_query_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_remapped_rows_query_out_line[0]} > 0 ]]
then
die 1 "$FUNCNAME: GPU id $i: Row remap pending"
fi
if [[ ${gpu_remapped_rows_query_out_line[1]} > 0 ]]
then
die 1 "$FUNCNAME: GPU id $i: Row remap error"
fi
dbg "GPU id $i: No GPU row remap pending or row remap errors"
if [[ ${gpu_query_out_line[0]} -gt 0 || ${gpu_query_out_line[1]} -gt 0 ]]; then
die 1 "$FUNCNAME: GPU id $i: SRAM Uncorrected ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
else
dbg "GPU id $i: Normal SRAM Uncorrectable ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
fi
if [[ -n $1 ]]; then
if [[ ${gpu_query_out_line[2]} -gt $1 || ${gpu_query_out_line[3]} -gt $1 ]]; then
die 1 "$FUNCNAME: GPU id $i: High DRAM Uncorrected ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
else
dbg "GPU id $i: Normal DRAM Uncorrectable ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
fi
fi
done
return 0
}


@@ -1,43 +0,0 @@
#!/bin/bash
# Check GPU persistence mode, if not enabled, attempt to enable.
PERSISTENCE_GPU_QUERY="persistence_mode"
function collect_persistence_data() {
gpu_query_out=$(nvidia-smi --query-gpu=$PERSISTENCE_GPU_QUERY --format=csv,noheader)
gpu_query_out_rc=$?
if [[ $gpu_query_out_rc != 0 ]]; then
log "$gpu_query_out"
die 1 "$FUNCNAME: nvidia-smi (persistence mode) returned error code $gpu_query_out_rc"
fi
IFS=$'\n'
gpu_query_out_lines=( $gpu_query_out )
IFS=$' \t\n'
}
function check_gpu_persistence() {
collect_persistence_data
for ((i=0; i<${#gpu_query_out_lines[*]}; i++))
do
if [[ ${gpu_query_out_lines[$i]} == Disabled ]]; then
dbg "$FUNCNAME: GPU id $i: Persistence mode is disabled, will attempt to enable"
gpu_persistence_out=$(nvidia-smi -i $i -pm 1)
gpu_persistence_out_rc=$?
if [[ $gpu_persistence_out_rc != 0 ]]; then
log "$gpu_persistence_out"
die 1 "$FUNCNAME: nvidia-smi (enable persistence mode) returned error code $gpu_persistence_out_rc"
else
dbg "$gpu_persistence_out"
fi
else
dbg "$FUNCNAME: GPU id $i: Persistence mode is already enabled"
fi
done
return 0
}


@@ -1,49 +0,0 @@
#!/bin/bash
#expect to not have any IB link flaps within a given time interval (in hours)
IB_FLAPPING_LINK_TEST="IB link flapping detected"
function check_ib_link_flapping()
{
TIME_INTERVAL_HOURS=$1
lost_carrier_file=/tmp/last_lost_carrier_date
lost_carrier_line=$(grep -i "ib.*lost carrier" /var/log/syslog | tail -n 1)
if [ "$lost_carrier_line" != "" ]; then
dbg "IB link flapping entry in syslog, $lost_carrier_line"
lost_carrier_array=( $lost_carrier_line )
last_date_str="${lost_carrier_array[0]} ${lost_carrier_array[1]} ${lost_carrier_array[2]}"
last_date_sec=$(date --date "$last_date_str" +%s)
dbg "last_date_sec = $last_date_sec"
if [ -f $lost_carrier_file ]; then
log "File $lost_carrier_file exists"
previous_stored_date=$(cat $lost_carrier_file)
dbg "File $lost_carrier_file contains, $previous_stored_date"
if [ "$last_date_str" != "$previous_stored_date" ]; then
previous_stored_date_sec=$(date --date "$previous_stored_date" +%s)
dbg "previous_stored_date_sec=$previous_stored_date_sec"
((diff_secs=$last_date_sec-$previous_stored_date_sec))
dbg "IB link flap time interval= $diff_secs sec"
((diff_hours=$diff_secs/(60*60)))
dbg "IB link flap time interval= $diff_hours hours"
if [ $diff_hours -lt $TIME_INTERVAL_HOURS ]; then
log "$IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours($previous_stored_date, $last_date_str)"
die 1 "$FUNCNAME: $IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours"
else
rm $lost_carrier_file
log "Time interval > $TIME_INTERVAL_HOURS, Remove $lost_carrier_file"
fi
fi
else
log "$lost_carrier_file does not exist, so will create it with $last_date_str"
echo $last_date_str > $lost_carrier_file
fi
else
dbg "No IB link flapping entry in syslog"
return 0
fi
}


@@ -1,57 +0,0 @@
#!/bin/bash
# Expected bandwidth > 180 Gbps
IB_WRITE_BW_EXE_PATH=/opt/perftest-4.5/ib_write_bw
IB_WRITE_BW=`basename $IB_WRITE_BW_EXE_PATH`
IB_WRITE_BW_DURATION=10
IB_WRITE_BW_ARGS="-s $(( 1 * 1024 * 1024 )) -D ${IB_WRITE_BW_DURATION} -x 0 -F --report_gbits"
SLEEP_TIME=5
HOSTNAME=`hostname`
#NUMA mapping for NDv4(A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )
function check_ib_bw_gdr()
{
EXP_IB_BW=$1
for device in {0..3};
do
IB_WRITE_BW_OUT1=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device} -d mlx5_ib${device} > /dev/null &)
IB_WRITE_BW_OUT1_RC=$?
if [[ $IB_WRITE_BW_OUT1_RC != 0 ]]; then
log "$IB_WRITE_BW_OUT1"
die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT1_RC"
return 1
fi
sleep $SLEEP_TIME
device_peer=$(( device+4 ))
IB_WRITE_BW_OUT2=$(numactl -N ${GPU_NUMA[$device_peer]} -m ${GPU_NUMA[$device_peer]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device_peer} -d mlx5_ib${device_peer} $HOSTNAME)
IB_WRITE_BW_OUT2_RC=$?
if [[ $IB_WRITE_BW_OUT2_RC != 0 ]]; then
log "$IB_WRITE_BW_OUT2"
die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT2_RC"
return 1
fi
IFS=$'\n'
IB_WRITE_BW_OUT2_LINES=( $IB_WRITE_BW_OUT2 )
IFS=$' \t\n'
for ((i=0; i<${#IB_WRITE_BW_OUT2_LINES[*]}; i++))
do
if [[ "${IB_WRITE_BW_OUT2_LINES[$i]//1048576}" != "${IB_WRITE_BW_OUT2_LINES[$i]}" ]]; then
LINE=( ${IB_WRITE_BW_OUT2_LINES[$i]} )
ib_bandwidth=${LINE[3]}
dbg "IB devices=mlx5_ib${device}, mlx5_ib${device_peer}: numa domains=${GPU_NUMA[$device]},${GPU_NUMA[$device_peer]}, Measured IB BW $ib_bandwidth Gbps"
break
fi
done
if [[ $ib_bandwidth < $EXP_IB_BW ]]; then
log "$IB_WRITE_BW_OUT2"
die 1 "$FUNCNAME: $IB_WRITE_BW, IB=mlx5_ib${device}, mlx5_ib${device_peer}, IB BW (expected > $EXP_IB_BW Gbps, but measured $ib_bandwidth Gbps"
return 1
fi
done
return 0
}


@@ -1,48 +0,0 @@
#!/bin/bash
# Check for IB issues by running NCCL allreduce disabling NCCL shared memory.
# Expected performance is > 19 GB/s
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -c 1"
function collect_nccl_allreduce_ib_loopback_data() {
nccl_allreduce_ib_loopback_out=$(source /etc/profile.d/modules.sh && module load mpi/hpcx && mpirun $MPI_ARGS $ENVIRON_VARS /opt/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
nccl_allreduce_ib_loopback_out_rc=$?
if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
log "nccl_allreduce_ib_loopback_freq_out"
die 1 "$FUNCNAME: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
fi
IFS=$'\n'
nccl_allreduce_ib_loopback_out_lines=( $nccl_allreduce_ib_loopback_out )
IFS=$' \t\n'
}
function check_nccl_allreduce_ib_loopback() {
EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=$1
collect_nccl_allreduce_ib_loopback_data
for ((i=0; i<${#nccl_allreduce_ib_loopback_out_lines[*]}; i++))
do
if [[ "${nccl_allreduce_ib_loopback_out_lines[$i]//bandwidth}" != "${nccl_allreduce_ib_loopback_out_lines[$i]}" ]]
then
IFS=$' \t\n'
nccl_allreduce_ib_loopback_out_line=( ${nccl_allreduce_ib_loopback_out_lines[$i]} )
avg_bus_bw=${nccl_allreduce_ib_loopback_out_line[5]}
dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
break
fi
done
dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"
if [[ $avg_bus_bw < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW ]]
then
log "$nccl_allreduce_ib_loopback_out"
die 1 "$FUNCNAME: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s"
return 1
fi
}


@@ -1,67 +0,0 @@
# NHC - nVidia GPU Checks
#
# Johan Guldmyr <jguldmyr@csc.fi>
# 17 Dec 2015
#
NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"
NVSMI_HEALTHMON_LINES=( )
NVSMI_HEALTHMON_OUTPUT=""
NVSMI_HEALTHMON_RC=""
export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC
function nhc_nvsmi_gather_data() {
local IFS
NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
NVSMI_HEALTHMON_RC=$?
IFS=$'\n'
NVSMI_HEALTHMON_LINES=( $NVSMI_HEALTHMON_OUTPUT )
}
# Run the nvidia-smi utility and verify that all GPUs
# are functioning properly.
function check_nvsmi_healthmon() {
if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
nhc_nvsmi_gather_data
fi
if [[ $NVSMI_HEALTHMON_RC -eq 0 ]]; then
dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
return 0
elif [[ $NVSMI_HEALTHMON_RC -eq 4 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Permission denied"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 8 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Power cables not attached"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 2 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Invalid argument or flag"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 9 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVIDIA driver not loaded"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 10 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Interrupt issue with a GPU"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 12 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVML shared library could not be found"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 14 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: InfoROM is corrupted"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 15 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: The GPU has fallen off the bus or has otherwise become inaccessible"
return 1
elif [[ $NVSMI_HEALTHMON_RC -gt 127 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f))"
return 1
else
log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Returned failure code $NVSMI_HEALTHMON_RC"
return 1
fi
}

specs/default/cluster-init/scripts/000_install_nhc.sh Normal file → Executable file

@@ -1,6 +1,4 @@
#!/bin/bash
set -x
set -e
chmod +x $CYCLECLOUD_SPEC_PATH/files/install_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/install_nhc.sh

specs/default/cluster-init/scripts/001_configure_nhc.sh Normal file → Executable file

@@ -1,19 +1,4 @@
#!/bin/bash
set -x
#set -e
sudo -i
platform_family=$(jetpack config platform_family)
if [ $platform_family == "ubuntu" ]; then
apt install -y jq
fi
if [ $platform_family == "rhel" ]; then
yum install -y jq
fi
jetpack config healthchecks --json > $CYCLECLOUD_SPEC_PATH/files/healthchecks.json
chmod +x $CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh


@@ -50,5 +50,5 @@ REPORT_PATH=$(jq -r '.report' ${HCHECK_SETTINGS_PATH})
APPLICATIONINSIGHTS_CONNECTION_STRING=$(jq -r '.appinsights.ConnectString' ${HCHECK_SETTINGS_PATH})
INSTRUMENTATION_KEY=$(jq -r '.appinsights.InstrumentationKey' ${HCHECK_SETTINGS_PATH})
$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log
#$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
#$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log


@@ -192,6 +192,7 @@ Autoscale = $Autoscale
[[[configuration healthchecks.nhc]]]
config = $NHCConf
log = /var/log/nhc.log
timeout = $NHCTimeout
@@ -580,7 +581,7 @@ Order = 20
[[[parameter UsePublicNetwork]]]
Label = Public Head Node
DefaultValue = true
DefaultValue = false
ParameterType = Boolean
Config.Label = Access scheduler node from the Internet
@@ -599,6 +600,10 @@ Order = 20
Label = NHC Config Name
Description = The name of the configuration file used to perform NHC healthchecks
[[[parameter NHCTimeout]]]
Label = NHC Test Timeout
Description = The number of milliseconds after which NHC tests time out
[[[parameter CustomScriptPattern]]]
Label = User Scripts Pattern
Description = The pattern used to detect custom tests