diag mpi pingpong: check HpcVmDrivers on RDMA Windows nodes

This commit is contained in:
FAREAST\chezhang 2019-01-29 04:51:40 +08:00
Родитель c47d00c788
Коммит d545526bb2
1 изменённый файл: 19 добавлений и 5 удалений

Просмотреть файл

@@ -367,6 +367,7 @@ def mpiPingpongCreateTasksWindows(nodelist, isRdma, startId, mpiLocation, log, u
if len(nodelist) == 0: if len(nodelist) == 0:
return tasks return tasks
hpcVmDriversExtensionPath = r'C:\Packages\Plugins\Microsoft.HpcCompute.HpcVmDrivers'
sampleOption = '-msglog {}:{}'.format(log, log + 1) if -1 < log < 30 else '-iter 10' sampleOption = '-msglog {}:{}'.format(log, log + 1) if -1 < log < 30 else '-iter 10'
rdmaOption = '' rdmaOption = ''
taskLabel = '[Windows]' taskLabel = '[Windows]'
@@ -390,13 +391,15 @@ def mpiPingpongCreateTasksWindows(nodelist, isRdma, startId, mpiLocation, log, u
commandStartSmpd = 'type nul >smpdstarted && smpd -d || del smpdstarted' commandStartSmpd = 'type nul >smpdstarted && smpd -d || del smpdstarted'
commandStopSmpd = 'if exist smpdstarted taskkill /f /im smpd.exe' commandStopSmpd = 'if exist smpdstarted taskkill /f /im smpd.exe'
commandCheckHost = 'nslookup [nodepong] 2>&1 | findstr /C:"can\'t find [nodepong]"' commandCheckHost = 'nslookup [nodepong] 2>&1 | findstr /C:"can\'t find [nodepong]"'
commandCheckMpi = 'not exist "%MSMPI_BIN%" (echo Microsoft MPI is not installed)' commandCheckMpi = 'if not exist "%MSMPI_BIN%" (echo Microsoft MPI is not installed)'
commandCheckSmpd = 'tasklist /fi "imagename eq smpd.exe" | findstr smpd' commandCheckSmpd = 'tasklist /fi "imagename eq smpd.exe" | findstr smpd'
commandCheckRdma = 'if not exist {} (echo HpcVmDrivers is not installed)'.format(hpcVmDriversExtensionPath)
commandCheckRdmaAndMpi = '{} else {}'.format(commandCheckRdma, commandCheckMpi)
commandSetEnvs = "$env:CCP_TASKCONTEXT=''; $env:path='%MSMPI_BIN%'" commandSetEnvs = "$env:CCP_TASKCONTEXT=''; $env:path='%MSMPI_BIN%'"
commandMpiIntra = "{}; mpiexec -hosts 1 %COMPUTERNAME% 2 '%MSMPI_BENCHMARKS%IMB-MPI1' {} pingpong".format(commandSetEnvs, sampleOption) commandMpiIntra = "{}; mpiexec -hosts 1 %COMPUTERNAME% 2 '%MSMPI_BENCHMARKS%IMB-MPI1' {} pingpong".format(commandSetEnvs, sampleOption)
commandMpiInter = "{}; mpiexec -hosts 2 [nodeping] 1 [nodepong] 1 '%MSMPI_BENCHMARKS%IMB-MPI1' -time 60 {} pingpong".format(commandSetEnvs, sampleOption) commandMpiInter = "{}; mpiexec -hosts 2 [nodeping] 1 [nodepong] 1 '%MSMPI_BENCHMARKS%IMB-MPI1' -time 60 {} pingpong".format(commandSetEnvs, sampleOption)
commandRunIntra = 'if {} else echo off && for /l %i in (1,1,30) do ({} && (powershell "{}" & exit) || ping -n 2 127.0.0.1 >nul)'.format(commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiIntra)) commandRunIntra = '{} else echo off && for /l %i in (1,1,30) do ({} && (powershell "{}" & exit) || ping -n 2 127.0.0.1 >nul)'.format(commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiIntra))
commandRunInter = '{} || if {} else {} && powershell "{}"'.format(commandCheckHost, commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiInter)) commandRunInter = '{} || {} else {} && powershell "{}"'.format(commandCheckHost, commandCheckRdmaAndMpi if isRdma else commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiInter))
else: else:
mpiEnvFile = r'{}\intel64\bin\mpivars.bat'.format(mpiLocation) mpiEnvFile = r'{}\intel64\bin\mpivars.bat'.format(mpiLocation)
commandSetFirewall = r'netsh firewall add allowedprogram "{}\intel64\bin\mpiexec.exe" hpc_diagnostics_mpiexec'.format(mpiLocation) # this way would only add one row in firewall rules commandSetFirewall = r'netsh firewall add allowedprogram "{}\intel64\bin\mpiexec.exe" hpc_diagnostics_mpiexec'.format(mpiLocation) # this way would only add one row in firewall rules
@@ -817,8 +820,11 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
reasonDapl = 'Error message: dapl fabric is not available and fallback fabric is not enabled' reasonDapl = 'Error message: dapl fabric is not available and fallback fabric is not enabled'
solutionDapl = 'Please check the RDMA driver availability and memory limit setting or re-create the VM.' solutionDapl = 'Please check the RDMA driver availability and memory limit setting or re-create the VM.'
reasonWindowsError1 = 'Error message: Error connecting to the Service' reasonHpcVmDriversNotInstalled = 'Windows network device drivers for RDMA connectivity is not installed.'
reasonWindowsError2 = 'Error message: The semaphore timeout period has expired' solutionHpcVmDriversNotInstalled = 'Install VM extension "HpcVmDrivers" on the node(s). More refer to https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-hpc.'
reasonWindowsError1 = 'Error message: Error connecting to the Service.'
reasonWindowsError2 = 'Error message: The semaphore timeout period has expired.'
failedReasons = {} failedReasons = {}
for failedPair in failedTasks: for failedPair in failedTasks:
@@ -839,6 +845,11 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
failedNode = pairedNode failedNode = pairedNode
failedPair['NodeOrPair'] = failedNode failedPair['NodeOrPair'] = failedNode
failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionIntelMpiNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode) failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionIntelMpiNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode)
elif "HpcVmDrivers is not installed" in output:
reason = reasonHpcVmDriversNotInstalled
failedNode = nodeName
failedPair['NodeOrPair'] = failedNode
failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionHpcVmDriversNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode)
elif "Microsoft MPI is not installed" in output: elif "Microsoft MPI is not installed" in output:
reason = reasonMsmpiNotInstalled reason = reasonMsmpiNotInstalled
failedNode = nodeName failedNode = nodeName
@@ -901,6 +912,9 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
nodesOrPairs = value.get(reasonMsmpiNotInstalled) nodesOrPairs = value.get(reasonMsmpiNotInstalled)
if nodesOrPairs: if nodesOrPairs:
value[reasonMsmpiNotInstalled] = list(set(nodesOrPairs)) value[reasonMsmpiNotInstalled] = list(set(nodesOrPairs))
nodesOrPairs = value.get(reasonHpcVmDriversNotInstalled)
if nodesOrPairs:
value[reasonHpcVmDriversNotInstalled] = list(set(nodesOrPairs))
for key in failedReasonsByNode.keys(): for key in failedReasonsByNode.keys():
severity = failedReasonsByNode[key].pop('Severity') severity = failedReasonsByNode[key].pop('Severity')
failedReasonsByNode["{} ({})".format(key, severity)] = failedReasonsByNode.pop(key) failedReasonsByNode["{} ({})".format(key, severity)] = failedReasonsByNode.pop(key)