Added the packaging of cc_slurm_nhc tests and config files to GitHub Actions
Parent: da344b26a3
Commit: 5f9f88f481
@@ -14,15 +14,21 @@ jobs:
      - name: Build project # This would actually build your project, using zip for an example artifact
        run: |
          cd ./hcheck/hcheck/
          dotnet build -r linux-x64 --self-contained
          dotnet build --use-current-runtime
      - name: Publish
        run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release -r linux-x64 --self-contained
        run: dotnet publish ./hcheck/hcheck/hcheck.csproj -c Release -o release --use-current-runtime
      - name: copy send_log file
        run: cp ./hcheck/hcheck/src/send_log /home/runner/work/cyclecloud-nodehealth/cyclecloud-nodehealth/hcheck/hcheck/bin/Release/net6.0/linux-x64/
      - name: Get the version
        id: get_version
        run:
          echo ::set-output name=VERSION::${GITHUB_REF#refs/tags/}
      - name: download nhc scripts
        run: |
          curl https://codeload.github.com/Azure/azurehpc/tar.gz/master |
          tar -xz --strip=7 azurehpc-master/experimental/cc_slurm_nhc/cc_slurm_nhc/specs/default/cluster-init
          cp ./files/*.conf /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-config/
          cp ./files/*.nhc /home/runner/work/cyclecloud-nodehealth/specs/default/cluster-init/files/nhc-tests/
      - name: tar files
        run: |
          echo ${{ steps.get_version.outputs.version }}
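The "Get the version" step above relies on plain bash parameter expansion to turn the pushed tag ref into a bare version string. A minimal sketch, assuming the workflow ran for a tag named v1.0.0 (the tag name is illustrative):

    GITHUB_REF=refs/tags/v1.0.0
    echo "${GITHUB_REF#refs/tags/}"   # strips the refs/tags/ prefix and prints: v1.0.0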
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
    rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
    echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
    echo "failed"; exit 1;
fi

node_index=$(jetpack config cyclecloud.node.name | cut -d- -f5)
if [[ $(expr $node_index % 2) == 0 ]]; then
    echo failed; exit 1;
fi
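For reference, this test script makes the health check fail on alternating runs: a run that finds no failed.txt marker writes one and exits non-zero, and the next run removes the marker and exits zero. Because both branches of the if exit, the node_index block after fi is unreachable as written. A sketch of two consecutive runs, assuming the script is saved as toggle_fail.sh (the filename is hypothetical):

    bash toggle_fail.sh; echo $?   # first run: writes failed.txt, prints "failed", exit code 1
    bash toggle_fail.sh; echo $?   # second run: removes failed.txt, exit code 0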
@@ -98,10 +98,12 @@ HCHECK_JSON=${HCHECK_FILES}${HCHECK_CONFIG}
#NHC_CONF_FILE_NEW=${CYCLECLOUD_SPEC_PATH}/files/$(jq -r '.nhc.config' ${HCHECK_JSON})

NHC_CONF_NAME=$(jq -r '.nhc.config' ${HCHECK_JSON})
NHC_TIMEOUT=$(jq -r '.nhc.timeout' ${HCHECK_JSON})

if [[ $NHC_CONF_NAME == null ]]
then
    $NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
    NHC_CONF_NAME=$(jetpack config azure.metadata.compute.vmSize).conf
fi
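In other words, when no config name is supplied in the healthchecks JSON, the script falls back to a config file named after the node's VM size. A minimal sketch, assuming the node is a Standard_ND96asr_v4 (the VM size is illustrative):

    jetpack config azure.metadata.compute.vmSize   # -> Standard_ND96asr_v4
    # NHC_CONF_NAME therefore becomes Standard_ND96asr_v4.conf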
@@ -0,0 +1,9 @@
#!/usr/bin/env bash
SHARED_DIR_PATH=/shared/home/aevdokimova
if [ -f $SHARED_DIR_PATH/"failed.txt" ];
then
    rm $SHARED_DIR_PATH/failed.txt; exit 0;
else
    echo "There was a hcheck error before" > $SHARED_DIR_PATH/failed.txt;
    echo "failed"; exit 1;
fi
@@ -52,7 +52,7 @@
* || check_fs_mount_rw -t "devtmpfs" -s "devtmpfs" -f "/dev"
* || check_fs_mount_rw -t "devpts" -s "devpts" -f "/dev/pts"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/run"
* || check_fs_mount_rw -t "xfs" -s "/dev/sda2" -f "/"
* || check_fs_mount_rw -t "xfs" -s "/dev/sdb2" -f "/"
* || check_fs_mount_rw -t "securityfs" -s "securityfs" -f "/sys/kernel/security"
* || check_fs_mount_rw -t "tmpfs" -s "tmpfs" -f "/dev/shm"
* || check_fs_mount_ro -t "tmpfs" -s "tmpfs" -f "/sys/fs/cgroup"
@@ -1,49 +0,0 @@
#!/bin/bash

# Expected bandwidth > 22 GB/s
BANDWIDTHTEST_EXE_PATH=/usr/local/cuda/samples/1_Utilities/bandwidthTest/bandwidthTest
BANDWIDTHTEST=`basename $BANDWIDTHTEST_EXE_PATH`

# NUMA mapping for NDv4 (A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )


function check_cuda_bw()
{
   EXP_CUDA_BW=$1
   for test in "--dtoh" "--htod"
   do
      for device in {0..7};
      do
         IFS=$'\n'
         CUDA_BW=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $BANDWIDTHTEST_EXE_PATH --device=$device $test)
         CUDA_BW_RC=$?
         if [[ $CUDA_BW_RC != 0 ]]
         then
            log "$CUDA_BW"
            die 1 "$FUNCNAME: $BANDWIDTHTEST returned error code $CUDA_BW_RC"
            return 1
         fi
         CUDA_BW_LINES=( $CUDA_BW )
         for ((i=0; i<${#CUDA_BW_LINES[*]}; i++))
         do
            if [[ "${CUDA_BW_LINES[$i]//32000000}" != "${CUDA_BW_LINES[$i]}" ]]
            then
               IFS=$' \t\n'
               LINE=( ${CUDA_BW_LINES[$i]} )
               cuda_bandwidth=${LINE[1]}
               dbg "gpu id=$device: numa domain=${GPU_NUMA[$device]}, Measured CUDA BW $cuda_bandwidth GB/s"
               break
            fi
         done
         if [[ $cuda_bandwidth < $EXP_CUDA_BW ]]
         then
            log "$CUDA_BW"
            die 1 "$FUNCNAME: $BANDWIDTHTEST, gpu=$device, CUDA BW $test (expected > $EXP_CUDA_BW GB/s, but measured $cuda_bandwidth GB/s)"
            return 1
         fi
      done
   done
   IFS=$' \t\n'
   return 0
}
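For reference, checks like this one are enabled from an NHC .conf file with the same syntax as the check_fs_mount lines earlier in this diff. The 22 GB/s threshold below is taken from the comment at the top of this script, and the match-all node pattern is the usual NHC default:

    * || check_cuda_bw 22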
@@ -1,47 +0,0 @@
#!/bin/bash

# Check whether the application GPU clock frequencies are set to their maximum values; if not, attempt to set them.

GPU_QUERY_CLOCKS="clocks.max.memory,clocks.applications.memory,clocks.max.graphics,clocks.applications.graphics"


function collect_clocks_data() {

   gpu_freq_out=$(nvidia-smi --query-gpu=$GPU_QUERY_CLOCKS --format=csv,noheader,nounits)
   gpu_freq_out_rc=$?
   if [[ $gpu_freq_out_rc != 0 ]]; then
      log "$gpu_freq_out"
      die 1 "$FUNCNAME: nvidia-smi (get clock freqs) returned error code $gpu_freq_out_rc"
   fi
   IFS=$'\n'
   gpu_freq_out_lines=( $gpu_freq_out )
   IFS=$' \t\n'
}


function check_app_gpu_clocks() {

   collect_clocks_data

   for ((i=0; i<${#gpu_freq_out_lines[*]}; i++))
   do
      IFS=$', '
      gpu_freq_out_line=( ${gpu_freq_out_lines[$i]} )
      IFS=$' \t\n'
      if [[ ${gpu_freq_out_line[0]} -gt ${gpu_freq_out_line[1]} || ${gpu_freq_out_line[2]} -gt ${gpu_freq_out_line[3]} ]]; then
         log "Warning: GPU Id $i: GPU memory freq (max,current)= (${gpu_freq_out_line[0]},${gpu_freq_out_line[1]}) MHz, GPU graphics freq (max,current) = (${gpu_freq_out_line[2]},${gpu_freq_out_line[3]}) MHz"
         log "Attempting to set application GPU clock frequencies to maximum frequencies"
         set_gpu_freq_out=$(nvidia-smi -i $i -ac ${gpu_freq_out_line[0]},${gpu_freq_out_line[2]})
         set_gpu_freq_out_rc=$?
         if [[ $set_gpu_freq_out_rc != 0 ]]; then
            log "$set_gpu_freq_out"
            die 1 "$FUNCNAME: nvidia-smi (set gpu max clock freqs) returned error code $set_gpu_freq_out_rc"
         fi
         log "On GPU Id $i: $set_gpu_freq_out"
         return 0
      else
         dbg "GPU Id $i: max application GPU clocks are already set, GPU memory is ${gpu_freq_out_line[0]} MHz and GPU graphics is ${gpu_freq_out_line[2]} MHz"
         return 0
      fi
   done
}
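The nvidia-smi query above returns four comma-separated values per GPU: max memory clock, current application memory clock, max graphics clock, and current application graphics clock. An illustrative line for an NDv4 (A100) GPU, with the numbers only meant to show the shape the parser expects:

    1593, 1593, 1410, 1410
    # max mem, app mem, max graphics, app graphics (MHz); the check warns whenever an application clock is below its max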
@@ -1,43 +0,0 @@
#!/bin/bash

GPU_THROTTLE_QUERY="clocks_throttle_reasons.active"

GPU_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN=0x0000000000000008
GPU_CLOCKS_THROTTLE_REASON_HW_THERMAL_SLOWDOWN=0x0000000000000040
GPU_CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCK_SETTINGS=0x0000000000000002
GPU_CLOCKS_THROTTLE_REASON_DISPLAY_SETTINGS=0x0000000000000100
GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE=0x0000000000000001
GPU_CLOCKS_THROTTLE_REASON_POWER_BRAKE_SLOWDOWN=0x0000000000000080
GPU_CLOCKS_THROTTLE_REASON_NONE=0x0000000000000000
GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP=0x0000000000000004
GPU_CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN=0x0000000000000020
GPU_CLOCKS_THROTTLE_REASON_SYNC_BOOST=0x0000000000000010


function collect_gpu_clock_throttle_data() {
   gpu_clock_throttle_query_out=$(nvidia-smi --query-gpu=$GPU_THROTTLE_QUERY --format=csv,noheader,nounits)
   gpu_clock_throttle_query_rc=$?
   if [[ $gpu_clock_throttle_query_rc != 0 ]]; then
      log "$gpu_clock_throttle_query_out"
      die 1 "$FUNCNAME: nvidia-smi (get gpu clock throttle data) returned error code $gpu_clock_throttle_query_rc"
   fi
   dbg "gpu_clock_throttle_query_out=$gpu_clock_throttle_query_out"
   IFS=$'\n'
   gpu_clock_throttle_out_lines=( $gpu_clock_throttle_query_out )
   IFS=$' \t\n'
}

function check_gpu_clock_throttling() {
   collect_gpu_clock_throttle_data
   for ((i=0; i<${#gpu_clock_throttle_out_lines[*]}; i++))
   do
      IFS=$', '
      gpu_clock_throttle_out_line=( ${gpu_clock_throttle_out_lines[$i]} )
      IFS=$' \t\n'
      if [[ ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_GPU_IDLE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_NONE && ${gpu_clock_throttle_out_line[0]} != $GPU_CLOCKS_THROTTLE_REASON_SW_POWER_CAP ]]; then
         log "Warning: GPU $i throttled, reason=${gpu_clock_throttle_out_line[0]}"
         # Just log GPU throttling (but do not DRAIN node)
         # die 1 "$FUNCNAME: GPU $i clock throttled, reason=${gpu_clock_throttle_out_line[0]}"
      fi
   done
}
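The query returns one active-throttle bitmask per GPU, and anything other than "none", "GPU idle", or "SW power cap" is logged as a warning (the node is not drained). Illustrative output for two GPUs, with values only showing the expected shape:

    0x0000000000000001
    0x0000000000000008
    # first GPU is merely idle (ignored); second GPU reports HW slowdown and triggers the warning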
@@ -1,68 +0,0 @@
#!/bin/bash

# Check for GPU ECC errors

GPU_REMAPPED_ROWS_QUERY="remapped_rows.pending,remapped_rows.failure"
GPU_QUERY="ecc.errors.uncorrected.volatile.sram,ecc.errors.uncorrected.aggregate.sram,ecc.errors.uncorrected.volatile.dram,ecc.errors.uncorrected.aggregate.dram"


function collect_ecc_data() {

   gpu_query_out=$(nvidia-smi --query-gpu=$GPU_QUERY --format=csv,noheader)
   gpu_query_out_rc=$?
   if [[ $gpu_query_out_rc != 0 ]]
   then
      log "$gpu_query_out"
      die 1 "$FUNCNAME: nvidia-smi (get gpu uncorrected counts) returned error code $gpu_query_out_rc"
   fi
   gpu_remapped_rows_out=$(nvidia-smi --query-remapped-rows=$GPU_REMAPPED_ROWS_QUERY --format=csv,noheader)
   gpu_remapped_rows_out_rc=$?
   if [[ $gpu_remapped_rows_out_rc != 0 ]]
   then
      log "$gpu_remapped_rows_out"
      die 1 "$FUNCNAME: nvidia-smi (get gpu remapped rows) returned error code $gpu_remapped_rows_out_rc"
   fi
   IFS=$'\n'
   gpu_query_out_lines=( $gpu_query_out )
   gpu_remapped_rows_query_out_lines=( $gpu_remapped_rows_out )
   IFS=$' \t\n'
}


function check_gpu_ecc() {

   collect_ecc_data

   if [[ ${#gpu_query_out_lines[*]} != ${#gpu_remapped_rows_query_out_lines[*]} ]]; then
      die 1 "$FUNCNAME: nvidia-smi (number of GPUs not consistent), (${#gpu_query_out_lines[*]},${#gpu_remapped_rows_query_out_lines[*]})"
   fi
   for ((i=0; i<${#gpu_remapped_rows_query_out_lines[*]}; i++))
   do
      IFS=$', '
      gpu_remapped_rows_query_out_line=( ${gpu_remapped_rows_query_out_lines[$i]} )
      gpu_query_out_line=( ${gpu_query_out_lines[$i]} )
      IFS=$' \t\n'
      if [[ ${gpu_remapped_rows_query_out_line[0]} > 0 ]]
      then
         die 1 "$FUNCNAME: GPU id $i: Row remap pending"
      fi
      if [[ ${gpu_remapped_rows_query_out_line[1]} > 0 ]]
      then
         die 1 "$FUNCNAME: GPU id $i: Row remap error"
      fi
      dbg "GPU id $i: No GPU row remap pending or row remap errors"
      if [[ ${gpu_query_out_line[0]} -gt 0 || ${gpu_query_out_line[1]} -gt 0 ]]; then
         die 1 "$FUNCNAME: GPU id $i: High SRAM Uncorrected ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
      else
         dbg "GPU id $i: Normal SRAM Uncorrected ECC error count, (${gpu_query_out_line[0]},${gpu_query_out_line[1]})"
      fi
      if [[ -n $1 ]]; then
         if [[ ${gpu_query_out_line[2]} -gt $1 || ${gpu_query_out_line[3]} -gt $1 ]]; then
            die 1 "$FUNCNAME: GPU id $i: High DRAM Uncorrected ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
         else
            dbg "GPU id $i: Normal DRAM Uncorrected ECC error count, (${gpu_query_out_line[2]},${gpu_query_out_line[3]})"
         fi
      fi
   done
   return 0
}
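For reference, the optional argument is the DRAM uncorrected-error threshold applied at the end of check_gpu_ecc; SRAM uncorrected errors and pending or failed row remaps always fail the node. A typical NHC .conf entry, with the 100-error threshold purely illustrative:

    * || check_gpu_ecc 100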
@@ -1,43 +0,0 @@
#!/bin/bash

# Check GPU persistence mode; if not enabled, attempt to enable it.

PERSISTENCE_GPU_QUERY="persistence_mode"


function collect_persistence_data() {

   gpu_query_out=$(nvidia-smi --query-gpu=$PERSISTENCE_GPU_QUERY --format=csv,noheader)
   gpu_query_out_rc=$?
   if [[ $gpu_query_out_rc != 0 ]]; then
      log "$gpu_query_out"
      die 1 "$FUNCNAME: nvidia-smi (persistence mode) returned error code $gpu_query_out_rc"
   fi
   IFS=$'\n'
   gpu_query_out_lines=( $gpu_query_out )
   IFS=$' \t\n'
}


function check_gpu_persistence() {

   collect_persistence_data

   for ((i=0; i<${#gpu_query_out_lines[*]}; i++))
   do
      if [[ ${gpu_query_out_lines[$i]} == Disabled ]]; then
         dbg "$FUNCNAME: GPU id $i: Persistence mode is disabled, will attempt to enable"
         gpu_persistence_out=$(nvidia-smi -i $i -pm 1)
         gpu_persistence_out_rc=$?
         if [[ $gpu_persistence_out_rc != 0 ]]; then
            log "$gpu_persistence_out"
            die 1 "$FUNCNAME: nvidia-smi (enable persistence mode) returned error code $gpu_persistence_out_rc"
         else
            dbg "$gpu_persistence_out"
         fi
      else
         dbg "$FUNCNAME: GPU id $i: Persistence mode is already enabled"
      fi
   done
   return 0
}
@@ -1,49 +0,0 @@
#!/bin/bash

# Expect no IB link flaps within a given time interval (in hours)
IB_FLAPPING_LINK_TEST="IB link flapping detected"


function check_ib_link_flapping()
{
   TIME_INTERVAL_HOURS=$1
   lost_carrier_file=/tmp/last_lost_carrier_date
   lost_carrier_line=$(grep -i "ib.*lost carrier" /var/log/syslog | tail -n 1)

   if [ "$lost_carrier_line" != "" ]; then
      dbg "IB link flapping entry in syslog, $lost_carrier_line"
      lost_carrier_array=( $lost_carrier_line )
      last_date_str="${lost_carrier_array[0]} ${lost_carrier_array[1]} ${lost_carrier_array[2]}"
      last_date_sec=$(date --date "$last_date_str" +%s)
      dbg "last_date_sec = $last_date_sec"

      if [ -f $lost_carrier_file ]; then
         log "File $lost_carrier_file exists"
         previous_stored_date=$(cat $lost_carrier_file)
         dbg "File $lost_carrier_file contains, $previous_stored_date"

         if [ "$last_date_str" != "$previous_stored_date" ]; then
            previous_stored_date_sec=$(date --date "$previous_stored_date" +%s)
            dbg "previous_stored_date_sec=$previous_stored_date_sec"
            ((diff_secs=$last_date_sec-$previous_stored_date_sec))
            dbg "IB link flap time interval= $diff_secs sec"
            ((diff_hours=$diff_secs/(60*60)))
            dbg "IB link flap time interval= $diff_hours hours"

            if [ $diff_hours -lt $TIME_INTERVAL_HOURS ]; then
               log "$IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours ($previous_stored_date, $last_date_str)"
               die 1 "$FUNCNAME: $IB_FLAPPING_LINK_TEST, multiple IB link flapping events within $TIME_INTERVAL_HOURS hours"
            else
               rm $lost_carrier_file
               log "Time interval > $TIME_INTERVAL_HOURS, removing $lost_carrier_file"
            fi
         fi
      else
         log "$lost_carrier_file does not exist, so will create it with $last_date_str"
         echo $last_date_str > $lost_carrier_file
      fi
   else
      dbg "No IB link flapping entry in syslog"
      return 0
   fi
}
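The check takes the allowed window in hours as its only argument and keys off "lost carrier" messages for IB interfaces in /var/log/syslog, so it only applies on systems that log there. A typical NHC .conf entry, with the 6-hour window purely illustrative:

    * || check_ib_link_flapping 6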
@@ -1,57 +0,0 @@
#!/bin/bash

# Expected bandwidth > 180 Gbps

IB_WRITE_BW_EXE_PATH=/opt/perftest-4.5/ib_write_bw
IB_WRITE_BW=`basename $IB_WRITE_BW_EXE_PATH`
IB_WRITE_BW_DURATION=10
IB_WRITE_BW_ARGS="-s $(( 1 * 1024 * 1024 )) -D ${IB_WRITE_BW_DURATION} -x 0 -F --report_gbits"
SLEEP_TIME=5

HOSTNAME=`hostname`
# NUMA mapping for NDv4 (A100)
GPU_NUMA=( 1 1 0 0 3 3 2 2 )


function check_ib_bw_gdr()
{
   EXP_IB_BW=$1
   for device in {0..3};
   do
      IB_WRITE_BW_OUT1=$(numactl -N ${GPU_NUMA[$device]} -m ${GPU_NUMA[$device]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device} -d mlx5_ib${device} > /dev/null &)
      IB_WRITE_BW_OUT1_RC=$?
      if [[ $IB_WRITE_BW_OUT1_RC != 0 ]]; then
         log "$IB_WRITE_BW_OUT1"
         die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT1_RC"
         return 1
      fi
      sleep $SLEEP_TIME
      device_peer=$(( device+4 ))
      IB_WRITE_BW_OUT2=$(numactl -N ${GPU_NUMA[$device_peer]} -m ${GPU_NUMA[$device_peer]} $IB_WRITE_BW_EXE_PATH $IB_WRITE_BW_ARGS --use_cuda=${device_peer} -d mlx5_ib${device_peer} $HOSTNAME)
      IB_WRITE_BW_OUT2_RC=$?
      if [[ $IB_WRITE_BW_OUT2_RC != 0 ]]; then
         log "$IB_WRITE_BW_OUT2"
         die 1 "$FUNCNAME: $IB_WRITE_BW returned error code $IB_WRITE_BW_OUT2_RC"
         return 1
      fi
      IFS=$'\n'
      IB_WRITE_BW_OUT2_LINES=( $IB_WRITE_BW_OUT2 )
      IFS=$' \t\n'
      for ((i=0; i<${#IB_WRITE_BW_OUT2_LINES[*]}; i++))
      do
         if [[ "${IB_WRITE_BW_OUT2_LINES[$i]//1048576}" != "${IB_WRITE_BW_OUT2_LINES[$i]}" ]]; then
            LINE=( ${IB_WRITE_BW_OUT2_LINES[$i]} )
            ib_bandwidth=${LINE[3]}
            dbg "IB devices=mlx5_ib${device}, mlx5_ib${device_peer}: numa domains=${GPU_NUMA[$device]},${GPU_NUMA[$device_peer]}, Measured IB BW $ib_bandwidth Gbps"
            break
         fi
      done
      if [[ $ib_bandwidth < $EXP_IB_BW ]]; then
         log "$IB_WRITE_BW_OUT2"
         die 1 "$FUNCNAME: $IB_WRITE_BW, IB=mlx5_ib${device}, mlx5_ib${device_peer}, IB BW (expected > $EXP_IB_BW Gbps, but measured $ib_bandwidth Gbps)"
         return 1
      fi
   done
   return 0
}
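For reference, this check runs ib_write_bw in loopback between paired HCAs (mlx5_ib0..3 as backgrounded servers, mlx5_ib4..7 as clients connecting to the local hostname) and compares the reported bandwidth against its argument. A typical NHC .conf entry, with the 180 Gbps floor taken from the comment at the top of this script:

    * || check_ib_bw_gdr 180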
@@ -1,48 +0,0 @@
#!/bin/bash

# Check for IB issues by running NCCL allreduce with NCCL shared memory disabled.
# Expected performance is > 19 GB/s

MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -c 1"


function collect_nccl_allreduce_ib_loopback_data() {

   nccl_allreduce_ib_loopback_out=$(source /etc/profile.d/modules.sh && module load mpi/hpcx && mpirun $MPI_ARGS $ENVIRON_VARS /opt/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
   nccl_allreduce_ib_loopback_out_rc=$?
   if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
      log "$nccl_allreduce_ib_loopback_out"
      die 1 "$FUNCNAME: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
   fi
   IFS=$'\n'
   nccl_allreduce_ib_loopback_out_lines=( $nccl_allreduce_ib_loopback_out )
   IFS=$' \t\n'
}


function check_nccl_allreduce_ib_loopback() {

   EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=$1
   collect_nccl_allreduce_ib_loopback_data

   for ((i=0; i<${#nccl_allreduce_ib_loopback_out_lines[*]}; i++))
   do
      if [[ "${nccl_allreduce_ib_loopback_out_lines[$i]//bandwidth}" != "${nccl_allreduce_ib_loopback_out_lines[$i]}" ]]
      then
         IFS=$' \t\n'
         nccl_allreduce_ib_loopback_out_line=( ${nccl_allreduce_ib_loopback_out_lines[$i]} )
         avg_bus_bw=${nccl_allreduce_ib_loopback_out_line[5]}
         dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
         break
      fi
   done
   dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"
   if [[ $avg_bus_bw < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW ]]
   then
      log "$nccl_allreduce_ib_loopback_out"
      die 1 "$FUNCNAME: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
      return 1
   fi
}
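A typical NHC .conf entry for this check; the 19 GB/s floor comes from the comment at the top of the script, and the mpirun/module paths are the ones the script itself assumes:

    * || check_nccl_allreduce_ib_loopback 19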
@@ -1,67 +0,0 @@
# NHC - nVidia GPU Checks
#
# Johan Guldmyr <jguldmyr@csc.fi>
# 17 Dec 2015
#

NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"

NVSMI_HEALTHMON_LINES=( )
NVSMI_HEALTHMON_OUTPUT=""
NVSMI_HEALTHMON_RC=""

export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC

function nhc_nvsmi_gather_data() {
    local IFS

    NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
    NVSMI_HEALTHMON_RC=$?
    IFS=$'\n'
    NVSMI_HEALTHMON_LINES=( $NVSMI_HEALTHMON_OUTPUT )
}

# Run the nvidia-smi utility and verify that all GPUs
# are functioning properly.
function check_nvsmi_healthmon() {
    if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
        nhc_nvsmi_gather_data
    fi

    if [[ $NVSMI_HEALTHMON_RC -eq 0 ]]; then
        dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
        return 0
    elif [[ $NVSMI_HEALTHMON_RC -eq 4 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Permission denied"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 8 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Power cables not attached"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 2 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Invalid argument or flag"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 9 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVIDIA driver not loaded"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 10 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Interrupt issue with a GPU"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 12 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: NVML shared library could not be found"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 14 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: InfoROM is corrupted"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -eq 15 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: The GPU has fallen off the bus or has otherwise become inaccessible"
        return 1
    elif [[ $NVSMI_HEALTHMON_RC -gt 127 ]]; then
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f))"
        return 1
    else
        log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
        die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: Returned failure code $NVSMI_HEALTHMON_RC"
        return 1
    fi
}
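This file follows the stock NHC convention for nvidia-smi checks, so the usual .conf entry applies and no argument is needed:

    * || check_nvsmi_healthmon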
@@ -1,6 +1,4 @@
#!/bin/bash
set -x
set -e

chmod +x $CYCLECLOUD_SPEC_PATH/files/install_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/install_nhc.sh
@@ -1,19 +1,4 @@
#!/bin/bash
set -x
#set -e

sudo -i

platform_family=$(jetpack config platform_family)

if [ $platform_family == "ubuntu" ]; then
    apt install -y jq
fi

if [ $platform_family == "rhel" ]; then
    yum install -y jq
fi

jetpack config healthchecks --json > $CYCLECLOUD_SPEC_PATH/files/healthchecks.json
chmod +x $CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh
$CYCLECLOUD_SPEC_PATH/files/configure_nhc.sh
@@ -50,5 +50,5 @@ REPORT_PATH=$(jq -r '.report' ${HCHECK_SETTINGS_PATH})

APPLICATIONINSIGHTS_CONNECTION_STRING=$(jq -r '.appinsights.ConnectString' ${HCHECK_SETTINGS_PATH})
INSTRUMENTATION_KEY=$(jq -r '.appinsights.InstrumentationKey' ${HCHECK_SETTINGS_PATH})
$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log
#$INSTALL_DIR/linux-x64/hcheck -k $INSTALL_DIR/reframe/azure_nhc/run_level_2 --append --rpath $REPORT_PATH --reframe $INSTALL_DIR/reframe/bin/reframe --config $INSTALL_DIR/reframe/azure_nhc/config/${reframe_cfg}
#$INSTALL_DIR/linux-x64/hcheck --rpath $REPORT_PATH --fin --appin $INSTRUMENTATION_KEY --rscript $INSTALL_DIR/sbin/send_log
@@ -192,6 +192,7 @@ Autoscale = $Autoscale
[[[configuration healthchecks.nhc]]]
config = $NHCConf
log = /var/log/nhc.log
timeout = $NHCTimeout

@@ -580,7 +581,7 @@ Order = 20

[[[parameter UsePublicNetwork]]]
Label = Public Head Node
DefaultValue = true
DefaultValue = false
ParameterType = Boolean
Config.Label = Access scheduler node from the Internet
@@ -599,6 +600,10 @@ Order = 20
Label = NHC Config Name
Description = The name of the configuration file used to perform NHC healthchecks

[[[parameter NHCTimeout]]]
Label = NHC Test Timeout
Description = The number of milliseconds after which NHC tests time out

[[[parameter CustomScriptPattern]]]
Label = User Scripts Pattern
Description = The pattern used to detect custom tests