Added packaging of the cc_slurm_nhc tests and config files to the GitHub Actions release workflow

Anastasiia Evdokimova 2023-03-15 16:35:59 -07:00
Parent da344b26a3
Commit 5f9f88f481
18 changed files with 43 additions and 495 deletions

.github/workflows/release.yml (vendored)

@@ -14,15 +14,21 @@ jobs:
- name: Build project # This would actually build your project, using zip for an example artifact
run: |
cd ./hcheck/hcheck/
dotnet build -r linux-x64 --self-contained
dotnet build --use-current-runtime
- name: Publish
run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release -r linux-x64 --self-contained
run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release --use-current-runtime
- name: copy send_log file
run: cp ./hcheck/hcheck/src/send_log /home/runner/work/cyclecloud-nodehealth/cyclecloud-nodehealth/hcheck/hcheck/bin/Release/net6.0/linux-x64/
- name: Get the version
id: get_version
run:
echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/}
- name: download nhc scripts
run: |
curl https://codeload.github.com/Azure/azurehpc/tar.gz/master |
tar -xz --strip=7 azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init
cp ./files/*.conf /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-config/
cp ./files/*.nhc /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-tests/
- name: tar files
run: |
echo ${{ steps.get_version.outputs.version }}
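For reference, a minimal local sketch of what the "download nhc scripts" step above does. The scratch directory and the explicit -f - are illustrative additions; the URL and the --strip=7 depth come from the workflow itself.

#!/usr/bin/env bash
# Sketch only: reproduce the extraction locally to see what --strip=7 leaves behind.
# The tarball member paths begin with seven directory components
# (azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init),
# so stripping them places the cluster-init contents directly in the working directory.
set -euo pipefail
mkdir -p /tmp/nhc-extract && cd /tmp/nhc-extract
curl -sL https://codeload.github.com/Azure/azurehpc/tar.gz/master \
  | tar -xzf - --strip=7 azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init
find . -maxdepth 3 | sort   # inspect what was extracted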


@@ -0,0 +1,14 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
echo "failed"; exit 1;
fi
node_index=$(jetpack config cyclecloud.node.name | cut -d- -f5)
if [[ $(expr $node_index % 2) == 0 ]]; then
echo failed; exit 1;
fi


@@ -98,10 +98,12 @@ HCHECK_JSON=${HCHECK_FILES}${HCHECK_CONFIG}
#NHC_CONF_FILE_NEW=${CYCLECLOUD_SPEC_PATH}/files/$(jq -r '.nhc.config' ${HCHECK_JSON})
NHC_CONF_NAME=$(jq -r '.nhc.config' ${HCHECK_JSON})
NHC_TIMEOUT=$(jq -r '.nhc.timeout' ${HCHECK_JSON})
if [[ $NHC_CONF_NAME == null ]]
then
$NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
fi
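For context, a minimal sketch of the settings file these jq lookups assume. The .nhc.config and .nhc.timeout key names come from the lines above; the file path and the concrete values below are illustrative only.

#!/usr/bin/env bash
# Hypothetical healthchecks.json of the shape read above (values are made up).
cat > /tmp/healthchecks.json <<'EOF'
{
  "nhc": {
    "config": "Standard_ND96asr_v4.conf",
    "timeout": 300000
  }
}
EOF
NHC_CONF_NAME=$(jq -r '.nhc.config' /tmp/healthchecks.json)   # -> Standard_ND96asr_v4.conf
NHC_TIMEOUT=$(jq -r '.nhc.timeout' /tmp/healthchecks.json)    # -> 300000
# When "config" is absent, jq -r prints the literal string "null",
# which is what the == null fallback above tests for before defaulting
# to the VM size reported by jetpack.
echo "config=$NHC_CONF_NAME timeout=$NHC_TIMEOUT"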


@@ -0,0 +1,9 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
echo "failed"; exit 1;
fi


@@ -52,7 +52,7 @@
* || check_fs_mount_rw -t "devtmpfs" -s "devtmpfs" -f "/dev"
* || check_fs_mount_rw -t "devpts" -s "devpts" -f "/dev/pts"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/run"
* || check_fs_mount_rw -t "xfs" -s "/dev/sda2" -f "/"
* || check_fs_mount_rw -t "xfs" -s "/dev/sdb2" -f "/"
* || check_fs_mount_rw -t "securityfs" -s "securityfs" -f "/sys/kernel/security"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/dev/shm"
* || check_fs_mount_ro -t "tmpfs" -s "tmpfs" -f "/sys/fs/cgroup"


@@ -1,49 +0,0 @@
#!/bin/bash
# Expected bandwidth > 22GB/s
BANDWIDTHTEST_EXE_PATH=/usr/local/cuda/samples/1_Utilities/bandwidthTest/bandwidthTest
BANDWIDTHTEST=`basename $BANDWIDTHTEST_EXE_PATH`
#NUMA mapping for NDv4(A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )
function check_cuda_bw()
{
EXP_CUDA_BW=$1
for test in "--dtoh" "--htod"
do
for device in {0..7};
do
IFS=$'\n'
CUDA_BW=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $BANDWIDTHTEST_EXE_PATH --device=$device $test)
CUDA_BW_RC=$?
if [[ $CUDA_BW_RC != 0 ]]
then
log "$CUDA_BW"
die 1 "$FUNCNAME: $BANDWIDTHTEST retuned error code $CUDA_BW_RC "
return 1
fi
CUDA_BW_LINES=( $CUDA_BW )
for ((i=0; i<${#CUDA_BW_LINES[*]}; i++))
do
if [[ "${CUDA_BW_LINES[$i]//32000000}" != "${CUDA_BW_LINES[$i]}" ]]
then
IFS=$' \t\n'
LINE=( ${CUDA_BW_LINES[$i]} )
cuda_bandwidth=${LINE[1]}
dbg "gpu id=$device: numa domain=${GPU_NUMA[$device]}, Measured CUDA BW $cuda_bandwidth GB/s"
break
fi
done
if [[ $cuda_bandwidth < $EXP_CUDA_BW ]]
then
log "$CUDA_BW"
die 1 "$FUNCNAME: $BANDWIDTHTEST, gpu=$device, CUDA BW $test (expected > $EXP_CUDA_BW GB/s, but measured $cuda_bandwidth GB/s"
return 1
fi
done
done
IFS=$' \t\n'
return 0
}


@@ -1,47 +0,0 @@
#!/bin/bash
# Check if application GPU clock frequencies are set to their maximum values, if not will attempt to set them.
GPU_QUERY_CLOCKS="clocks.max.memory,clocks.applications.memory,clocks.max.graphics,clocks.applications.graphics"
function collect_clocks_data() {
gpu_freq_out=$(nvidia-smi --query-gpu=$GPU_QUERY_CLOCKS --format=csv,noheader,nounits)
gpu_freq_out_rc=$?
if [[ $gpu_freq_out_rc != 0 ]]; then
log "$gpu_freq_out"
die 1 "$FUNCNAME: nvidia-smi (get clock freqs) returned error code $gpu_freq_out_rc"
fi
IFS=$'\n'
gpu_freq_out_lines=( $gpu_freq_out )
IFS=$' \t\n'
}
function check_app_gpu_clocks() {
collect_clocks_data
for ((i=0; i<${#gpu_freq_out_lines[*]}; i++))
do
IFS=$', '
gpu_freq_out_line=( ${gpu_freq_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_freq_out_line[0]} -gt ${gpu_freq_out_line[1]} || ${gpu_freq_out_line[2]} -gt ${gpu_freq_out_line[3]} ]]; then
log "Warning: GPU Id $i: GPU memory freq (max,current)= (${gpu_freq_out_line[0]},${gpu_freq_out_line[1]}) MHz, GPU graphics freq (max,current) = (${gpu_freq_out_line[2]},${gpu_freq_out_line[3]}) MHz"
log "Attempting to set application GPU clock frequencies to maximum frequencies"
set_gpu_freq_out=$(nvidia-smi -i $i -ac ${gpu_freq_out_line[0]},${gpu_freq_out_line[2]})
set_gpu_freq_out_rc=$?
if [[ $set_gpu_freq_out_rc != 0 ]]; then
log "$set_gpu_freq_out"
die 1 "$FUNCNAME: nvidia-smi (set gpu max clock freqs) returned error code $set_gpu_freq_out_rc"
fi
log "On GPU Id $i: $set_gpu_freq_out"
return 0
else
dbg "GPU Id $i: max application GPU clocks are already set, GPU memory is ${gpu_freq_out_line[0]} MHz and GPU graphics is ${gpu_freq_out_line[2]} MHz"
return 0
fi
done
}


@@ -1,43 +0,0 @@
#!/bin/bash
GPU_THROTTLE_QUERY="clocks_throttle_reasons.active"
GPU_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN=0x0000000000000008
GPU_CLOCKS_THROTTLE_REASON_HW_THERMAL_SLOWDOWN=0x0000000000000040
GPU_CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCK_SETTINGS=0x0000000000000002
GPU_CLOCKS_THROTTLE_REASON_DISPLAY_SETTINGS=0x0000000000000100
GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE=0x0000000000000001
GPU_CLOCKS_THROTTLE_REASON_POWER_BRAKE_SLOWDOWN=0x0000000000000080
GPU_CLOCKS_THROTTLE_REASON_NONE=0x0000000000000000
GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP=0x0000000000000004
GPU_CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN=0x0000000000000020
GPU_CLOCKS_THROTTLE_REASON_SYNC_BOOST=0x0000000000000010
function collect_gpu_clock_throttle_data() {
gpu_clock_throttle_query_out=$(nvidia-smi --query-gpu=$GPU_THROTTLE_QUERY --format=csv,noheader,nounits)
gpu_clock_throttle_query_rc=$?
if [[ $gpu_clock_throttle_query_rc != 0 ]]; then
log "$gpu_clock_throttle_query_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu clock throttle data) returned error code $gpu_clock_throttle_query_rc"
fi
dbg "gpu_clock_throttle_query_out=$gpu_clock_throttle_query_out"
IFS=$'\n'
gpu_clock_throttle_out_lines=( $gpu_clock_throttle_query_out )
IFS=$' \t\n'
}
function check_gpu_clock_throttling() {
collect_gpu_clock_throttle_data
for ((i=0; i<${#gpu_clock_throttle_out_lines[*]}; i++))
do
IFS=$', '
gpu_clock_throttle_out_line=( ${gpu_clock_throttle_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_NONE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP ]]; then
log "Warning: GPU $i throttled, reason=${gpu_clock_throttle_out_line[0]}"
# Just log GPU throttling (but do not DRAIN node)
# die 1 "$FUNCNAME: GPU $i clock throttled, reason=${gpu_clock_throttle_out_line[0]}"
fi
done
}


@@ -1,68 +0,0 @@
#!/bin/bash
# Check for GPU ECC errors
GPU_REMAPPED_ROWS_QUERY="remapped_rows.pending,remapped_rows.failure"
GPU_QUERY="ecc.errors.uncorrected.volatile.sram,ecc.errors.uncorrected.aggregate.sram,ecc.errors.uncorrected.volatile.dram,ecc.errors.uncorrected.aggregate.dram"
function collect_ecc_data() {
gpu_query_out=$(nvidia-smi --query-gpu=$GPU_QUERY --format=csv,noheader)
gpu_query_out_rc=$?
if [[ $gpu_query_out_rc != 0 ]]
then
log "$gpu_query_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu uncorrected counts) returned error code $gpu_query_out_rc"
fi
gpu_remapped_rows_out=$(nvidia-smi --query-remapped-rows=$GPU_REMAPPED_ROWS_QUERY --format=csv,noheader)
gpu_remapped_rows_out_rc=$?
if [[ $gpu_remapped_rows_out_rc != 0 ]]
then
log "$gpu_remaped_rows_out"
die 1 "$FUNCNAME: nvidia-smi (get gpu remapped rows) returned error code $gpu_freq_out_rc"
fi
IFS=$'\n'
gpu_query_out_lines=( $gpu_query_out )
gpu_remapped_rows_query_out_lines=( $gpu_remapped_rows_out )
IFS=$' \t\n'
}
function check_gpu_ecc() {
collect_ecc_data
if [[ ${#gpu_query_out_lines[*]} != ${#gpu_remapped_rows_query_out_lines[*]} ]]; then
die 1 "$FUNCNAME: nvidia-smi (Number GPU's not correct), (${#gpu_query_out_lines[*]},${#gpu_remapped_rows_query_out_lines[*]})"
fi
for ((i=0; i<${#gpu_remapped_rows_query_out_lines[*]}; i++))
do
IFS=$', '
gpu_remapped_rows_query_out_line=( ${gpu_remapped_rows_query_out_lines[$i]} )
gpu_query_out_line=( ${gpu_query_out_lines[$i]} )
IFS=$' \t\n'
if [[ ${gpu_remapped_rows_query_out_line[0]} > 0 ]]
then
die 1 "$FUNCNAME: GPU id $i: Row remap pending"
fi
if [[ ${gpu_remapped_rows_query_out_line[1]} > 0 ]]
then
die 1 "$FUNCNAME: GPU id $i: Row remap error"
fi
dbg "GPU id $i: No GPU row remap pending or row remap errors"
if [[ ${gpu_query_out_line[0]} -gt 0 || ${gpu_query_out_line[1]} -gt 0 ]]; then
die 1 "$FUNCNAME: GPU id $i: SRAM Uncorrected ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
else
dbg "GPU id $i: Normal SRAM Uncorrectable ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
fi
if [[ -n $1 ]]; then
if [[ ${gpu_query_out_line[2]} -gt $1 || ${gpu_query_out_line[3]} -gt $1 ]]; then
die 1 "$FUNCNAME: GPU id $i: High DRAM Uncorrected ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
else
dbg "GPU id $i: Normal DRAM Uncorrectable ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
fi
fi
done
return 0
}


@@ -1,43 +0,0 @@
#!/bin/bash
# Check GPU persistence mode, if not enabled, attempt to enable.
PERSISTENCE_GPU_QUERY="persistence_mode"
function collect_persistence_data() {
gpu_query_out=$(nvidia-smi --query-gpu=$PERSISTENCE_GPU_QUERY --format=csv,noheader)
gpu_query_out_rc=$?
if [[ $gpu_query_out_rc != 0 ]]; then
log "$gpu_query_out"
die 1 "$FUNCNAME: nvidia-smi (persistence mode) returned error code $gpu_query_out_rc"
fi
IFS=$'\n'
gpu_query_out_lines=( $gpu_query_out )
IFS=$' \t\n'
}
function check_gpu_persistence() {
collect_persistence_data
for ((i=0; i<${#gpu_query_out_lines[*]}; i++))
do
if [[ ${gpu_query_out_lines[$i]} == Disabled ]]; then
dbg "$FUNCNAME: GPU id $i: Persistence mode is disabled, will attempt to enable"
gpu_persistence_out=$(nvidia-smi -i $i -pm 1)
gpu_persistence_out_rc=$?
if [[ $gpu_persistence_out_rc != 0 ]]; then
log "$gpu_persistence_out"
die 1 "$FUNCNAME: nvidia-smi (enable persistence mode) returned error code $gpu_persistence_out_rc"
else
dbg "$gpu_persistence_out"
fi
else
dbg "$FUNCNAME: GPU id $i: Persistence mode is already enabled"
fi
done
return 0
}


@@ -1,49 +0,0 @@
#!/bin/bash
#expect to not have any IB link flaps within a given time interval (in hours)
IB_FLAPPING_LINK_TEST="IB link flapping detected"
function check_ib_link_flapping()
{
TIME_INTERVAL_HOURS=$1
lost_carrier_file=/tmp/last_lost_carrier_date
lost_carrier_line=$(grep -i "ib.*lost carrier" /var/log/syslog | tail -n 1)
if [ "$lost_carrier_line" != "" ]; then
dbg "IB link flapping entry in syslog, $lost_carrier_line"
lost_carrier_array=( $lost_carrier_line )
last_date_str="${lost_carrier_array[0]} ${lost_carrier_array[1]} ${lost_carrier_array[2]}"
last_date_sec=$(date --date "$last_date_str" +%s)
dbg "last_date_sec = $last_date_sec"
if [ -f $lost_carrier_file ]; then
log "File $lost_carrier_file exists"
previous_stored_date=$(cat $lost_carrier_file)
dbg "File $lost_carrier_file contains, $previous_stored_date"
if [ "$last_date_str" != "$previous_stored_date" ]; then
previous_stored_date_sec=$(date --date "$previous_stored_date" +%s)
dbg "previous_stored_date_sec=$previous_stored_date_sec"
((diff_secs=$last_date_sec-$previous_stored_date_sec))
dbg "IB link flap time interval= $diff_secs sec"
((diff_hours=$diff_secs/(60*60)))
dbg "IB link flap time interval= $diff_hours hours"
if [ $diff_hours -lt $TIME_INTERVAL_HOURS ]; then
log "$IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours($previous_stored_date, $last_date_str)"
die 1 "$FUNCNAME: $IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours"
else
rm $lost_carrier_file
log "Time interval > $TIME_INTERVAL_HOURS, Remove $lost_carrier_file"
fi
fi
else
log "$lost_carrier_file does not exist, so will create it with $last_date_str"
echo $last_date_str > $lost_carrier_file
fi
else
dbg "No IB link flapping entry in syslog"
return 0
fi
}


@@ -1,57 +0,0 @@
#!/bin/bash
# Expected bandwidth > 180 Gbps
IB_WRITE_BW_EXE_PATH=/opt/perftest-4.5/ib_write_bw
IB_WRITE_BW=`basename $IB_WRITE_BW_EXE_PATH`
IB_WRITE_BW_DURATION=10
IB_WRITE_BW_ARGS="-s $(( 1 * 1024 * 1024 )) -D ${IB_WRITE_BW_DURATION} -x 0 -F --report_gbits"
SLEEP_TIME=5
HOSTNAME=`hostname`
#NUMA mapping for NDv4(A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )
function check_ib_bw_gdr()
{
EXP_IB_BW=$1
for device in {0..3};
do
IB_WRITE_BW_OUT1=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device} -d mlx5_ib${device} > /dev/null &)
IB_WRITE_BW_OUT1_RC=$?
if [[ $IB_WRITE_BW_OUT1_RC != 0 ]]; then
log "$IB_WRITE_BW_OUT1"
die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT1_RC"
return 1
fi
sleep $SLEEP_TIME
device_peer=$(( device+4 ))
IB_WRITE_BW_OUT2=$(numactl -N ${GPU_NUMA[$device_peer]} -m ${GPU_NUMA[$device_peer]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device_peer} -d mlx5_ib${device_peer} $HOSTNAME)
IB_WRITE_BW_OUT2_RC=$?
if [[ $IB_WRITE_BW_OUT2_RC != 0 ]]; then
log "$IB_WRITE_BW_OUT2"
die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT2_RC"
return 1
fi
IFS=$'\n'
IB_WRITE_BW_OUT2_LINES=( $IB_WRITE_BW_OUT2 )
IFS=$' \t\n'
for ((i=0; i<${#IB_WRITE_BW_OUT2_LINES[*]}; i++))
do
if [[ "${IB_WRITE_BW_OUT2_LINES[$i]//1048576}" != "${IB_WRITE_BW_OUT2_LINES[$i]}" ]]; then
LINE=( ${IB_WRITE_BW_OUT2_LINES[$i]} )
ib_bandwidth=${LINE[3]}
dbg "IB devices=mlx5_ib${device}, mlx5_ib${device_peer}: numa domains=${GPU_NUMA[$device]},${GPU_NUMA[$device_peer]}, Measured IB BW $ib_bandwidth Gbps"
break
fi
done
if [[ $ib_bandwidth < $EXP_IB_BW ]]; then
log "$IB_WRITE_BW_OUT2"
die 1 "$FUNCNAME: $IB_WRITE_BW, IB=mlx5_ib${device}, mlx5_ib${device_peer}, IB BW (expected > $EXP_IB_BW Gbps, but measured $ib_bandwidth Gbps"
return 1
fi
done
return 0
}


@@ -1,48 +0,0 @@
#!/bin/bash
# Check for IB issues by running NCCL allreduce disabling NCCL shared memory.
# Expected performance is > 19 GB/s
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -c 1"
function collect_nccl_allreduce_ib_loopback_data() {
nccl_allreduce_ib_loopback_out=$(source /etc/profile.d/modules.sh && module load mpi/hpcx && mpirun $MPI_ARGS $ENVIRON_VARS /opt/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
nccl_allreduce_ib_loopback_out_rc=$?
if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
log "nccl_allreduce_ib_loopback_freq_out"
die 1 "$FUNCNAME: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
fi
IFS=$'\n'
nccl_allreduce_ib_loopback_out_lines=( $nccl_allreduce_ib_loopback_out )
IFS=$' \t\n'
}
function check_nccl_allreduce_ib_loopback() {
EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=$1
collect_nccl_allreduce_ib_loopback_data
for ((i=0; i<${#nccl_allreduce_ib_loopback_out_lines[*]}; i++))
do
if [[ "${nccl_allreduce_ib_loopback_out_lines[$i]//bandwidth}" != "${nccl_allreduce_ib_loopback_out_lines[$i]}" ]]
then
IFS=$' \t\n'
nccl_allreduce_ib_loopback_out_line=( ${nccl_allreduce_ib_loopback_out_lines[$i]} )
avg_bus_bw=${nccl_allreduce_ib_loopback_out_line[5]}
dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
break
fi
done
dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"
if [[ $avg_bus_bw < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW ]]
then
log "$nccl_allreduce_ib_loopback_out"
die 1 "$FUNCNAME: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s"
return 1
fi
}


@@ -1,67 +0,0 @@
# NHC - nVidia GPU Checks
#
# Johan Guldmyr <jguldmyr@csc.fi>
# 17 Dec 2015
#
NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"
NVSMI_HEALTHMON_LINES=( )
NVSMI_HEALTHMON_OUTPUT=""
NVSMI_HEALTHMON_RC=""
export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC
function nhc_nvsmi_gather_data() {
local IFS
NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
NVSMI_HEALTHMON_RC=$?
IFS=$'\n'
NVSMI_HEALTHMON_LINES=( $NVSMI_HEALTHMON_OUTPUT )
}
# Run the nvidia-smi utility and verify that all GPUs
# are functioning properly.
function check_nvsmi_healthmon() {
if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
nhc_nvsmi_gather_data
fi
if [[ $NVSMI_HEALTHMON_RC -eq 0 ]]; then
dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
return 0
elif [[ $NVSMI_HEALTHMON_RC -eq 4 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Permission denied"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 8 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Power cables not attached"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 2 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Invalid argument or flag"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 9 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVIDIA driver not loaded"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 10 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Interrupt issue with a GPU"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 12 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVML shared library could not be found"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 14 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: InfoROM is corrupted"
return 1
elif [[ $NVSMI_HEALTHMON_RC -eq 15 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: The GPU has fallen off the bus or has otherwise become inaccessible"
return 1
elif [[ $NVSMI_HEALTHMON_RC -gt 127 ]]; then
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f))"
return 1
else
log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Returned failure code $NVSMI_HEALTHMON_RC"
return 1
fi
}

specs/default/cluster-init/scripts/000_install_nhc.sh Normal file → Executable file

@@ -1,6 +1,4 @@
#!/bin/bash
set -x
set -e
chmod +x $CYCLECLOUD_SPEC_PATH/files/install_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/install_nhc.sh

specs/default/cluster-init/scripts/001_configure_nhc.sh Normal file → Executable file

@@ -1,19 +1,4 @@
#!/bin/bash
set -x
#set -e
sudo -i
platform_family=$(jetpack config platform_family)
if [ $platform_family == "ubuntu" ]; then
apt install -y jq
fi
if [ $platform_family == "rhel" ]; then
yum install -y jq
fi
jetpack config healthchecks --json > $CYCLECLOUD_SPEC_PATH/files/healthchecks.json
chmod +x $CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh


@@ -50,5 +50,5 @@ REPORT_PATH=$(jq -r '.report' ${HCHECK_SETTINGS_PATH})
APPLICATIONINSIGHTS_CONNECTION_STRING=$(jq -r '.appinsights.ConnectString' ${HCHECK_SETTINGS_PATH})
INSTRUMENTATION_KEY=$(jq -r '.appinsights.InstrumentationKey' ${HCHECK_SETTINGS_PATH})
$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log
#$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
#$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log


@@ -192,6 +192,7 @@ Autoscale = $Autoscale
[[[configuration healthchecks.nhc]]]
config = $NHCConf
log = /var/log/nhc.log
timeout = $NHCTimeout
@@ -580,7 +581,7 @@ Order = 20
[[[parameter UsePublicNetwork]]]
Label = Public Head Node
DefaultValue = true
DefaultValue = false
ParameterType = Boolean
Config.Label = Access scheduler node from the Internet
@@ -599,6 +600,10 @@ Order = 20
Label = NHC Config Name
Description = The name of the configuration file used to perform NHC healthchecks
[[[parameter NHCTimeout]]]
Label = NHC Test Timeout
Description = The number of milliseconds after which NHC tests time out
[[[parameter CustomScriptPattern]]]
Label = User Scripts Pattern
Description = The pattern used to detect custom tests