зеркало из https://github.com/Azure/hpcpack-acm.git
diag mpi pingpong: check HpcVmDrivers on RDMA Windows nodes
This commit is contained in:
Родитель
c47d00c788
Коммит
d545526bb2
|
@ -367,6 +367,7 @@ def mpiPingpongCreateTasksWindows(nodelist, isRdma, startId, mpiLocation, log, u
|
||||||
if len(nodelist) == 0:
|
if len(nodelist) == 0:
|
||||||
return tasks
|
return tasks
|
||||||
|
|
||||||
|
hpcVmDriversExtensionPath = r'C:\Packages\Plugins\Microsoft.HpcCompute.HpcVmDrivers'
|
||||||
sampleOption = '-msglog {}:{}'.format(log, log + 1) if -1 < log < 30 else '-iter 10'
|
sampleOption = '-msglog {}:{}'.format(log, log + 1) if -1 < log < 30 else '-iter 10'
|
||||||
rdmaOption = ''
|
rdmaOption = ''
|
||||||
taskLabel = '[Windows]'
|
taskLabel = '[Windows]'
|
||||||
|
@ -390,13 +391,15 @@ def mpiPingpongCreateTasksWindows(nodelist, isRdma, startId, mpiLocation, log, u
|
||||||
commandStartSmpd = 'type nul >smpdstarted && smpd -d || del smpdstarted'
|
commandStartSmpd = 'type nul >smpdstarted && smpd -d || del smpdstarted'
|
||||||
commandStopSmpd = 'if exist smpdstarted taskkill /f /im smpd.exe'
|
commandStopSmpd = 'if exist smpdstarted taskkill /f /im smpd.exe'
|
||||||
commandCheckHost = 'nslookup [nodepong] 2>&1 | findstr /C:"can\'t find [nodepong]"'
|
commandCheckHost = 'nslookup [nodepong] 2>&1 | findstr /C:"can\'t find [nodepong]"'
|
||||||
commandCheckMpi = 'not exist "%MSMPI_BIN%" (echo Microsoft MPI is not installed)'
|
commandCheckMpi = 'if not exist "%MSMPI_BIN%" (echo Microsoft MPI is not installed)'
|
||||||
commandCheckSmpd = 'tasklist /fi "imagename eq smpd.exe" | findstr smpd'
|
commandCheckSmpd = 'tasklist /fi "imagename eq smpd.exe" | findstr smpd'
|
||||||
|
commandCheckRdma = 'if not exist {} (echo HpcVmDrivers is not installed)'.format(hpcVmDriversExtensionPath)
|
||||||
|
commandCheckRdmaAndMpi = '{} else {}'.format(commandCheckRdma, commandCheckMpi)
|
||||||
commandSetEnvs = "$env:CCP_TASKCONTEXT=''; $env:path='%MSMPI_BIN%'"
|
commandSetEnvs = "$env:CCP_TASKCONTEXT=''; $env:path='%MSMPI_BIN%'"
|
||||||
commandMpiIntra = "{}; mpiexec -hosts 1 %COMPUTERNAME% 2 '%MSMPI_BENCHMARKS%IMB-MPI1' {} pingpong".format(commandSetEnvs, sampleOption)
|
commandMpiIntra = "{}; mpiexec -hosts 1 %COMPUTERNAME% 2 '%MSMPI_BENCHMARKS%IMB-MPI1' {} pingpong".format(commandSetEnvs, sampleOption)
|
||||||
commandMpiInter = "{}; mpiexec -hosts 2 [nodeping] 1 [nodepong] 1 '%MSMPI_BENCHMARKS%IMB-MPI1' -time 60 {} pingpong".format(commandSetEnvs, sampleOption)
|
commandMpiInter = "{}; mpiexec -hosts 2 [nodeping] 1 [nodepong] 1 '%MSMPI_BENCHMARKS%IMB-MPI1' -time 60 {} pingpong".format(commandSetEnvs, sampleOption)
|
||||||
commandRunIntra = 'if {} else echo off && for /l %i in (1,1,30) do ({} && (powershell "{}" & exit) || ping -n 2 127.0.0.1 >nul)'.format(commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiIntra))
|
commandRunIntra = '{} else echo off && for /l %i in (1,1,30) do ({} && (powershell "{}" & exit) || ping -n 2 127.0.0.1 >nul)'.format(commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiIntra))
|
||||||
commandRunInter = '{} || if {} else {} && powershell "{}"'.format(commandCheckHost, commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiInter))
|
commandRunInter = '{} || {} else {} && powershell "{}"'.format(commandCheckHost, commandCheckRdmaAndMpi if isRdma else commandCheckMpi, commandCheckSmpd, commandMeasureTime.replace('[command]', commandMpiInter))
|
||||||
else:
|
else:
|
||||||
mpiEnvFile = r'{}\intel64\bin\mpivars.bat'.format(mpiLocation)
|
mpiEnvFile = r'{}\intel64\bin\mpivars.bat'.format(mpiLocation)
|
||||||
commandSetFirewall = r'netsh firewall add allowedprogram "{}\intel64\bin\mpiexec.exe" hpc_diagnostics_mpiexec'.format(mpiLocation) # this way would only add one row in firewall rules
|
commandSetFirewall = r'netsh firewall add allowedprogram "{}\intel64\bin\mpiexec.exe" hpc_diagnostics_mpiexec'.format(mpiLocation) # this way would only add one row in firewall rules
|
||||||
|
@ -817,8 +820,11 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
|
||||||
reasonDapl = 'Error message: dapl fabric is not available and fallback fabric is not enabled'
|
reasonDapl = 'Error message: dapl fabric is not available and fallback fabric is not enabled'
|
||||||
solutionDapl = 'Please check the RDMA driver availability and memory limit setting or re-create the VM.'
|
solutionDapl = 'Please check the RDMA driver availability and memory limit setting or re-create the VM.'
|
||||||
|
|
||||||
reasonWindowsError1 = 'Error message: Error connecting to the Service'
|
reasonHpcVmDriversNotInstalled = 'Windows network device drivers for RDMA connectivity is not installed.'
|
||||||
reasonWindowsError2 = 'Error message: The semaphore timeout period has expired'
|
solutionHpcVmDriversNotInstalled = 'Install VM extension "HpcVmDrivers" on the node(s). More refer to https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-hpc.'
|
||||||
|
|
||||||
|
reasonWindowsError1 = 'Error message: Error connecting to the Service.'
|
||||||
|
reasonWindowsError2 = 'Error message: The semaphore timeout period has expired.'
|
||||||
|
|
||||||
failedReasons = {}
|
failedReasons = {}
|
||||||
for failedPair in failedTasks:
|
for failedPair in failedTasks:
|
||||||
|
@ -839,6 +845,11 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
|
||||||
failedNode = pairedNode
|
failedNode = pairedNode
|
||||||
failedPair['NodeOrPair'] = failedNode
|
failedPair['NodeOrPair'] = failedNode
|
||||||
failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionIntelMpiNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode)
|
failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionIntelMpiNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode)
|
||||||
|
elif "HpcVmDrivers is not installed" in output:
|
||||||
|
reason = reasonHpcVmDriversNotInstalled
|
||||||
|
failedNode = nodeName
|
||||||
|
failedPair['NodeOrPair'] = failedNode
|
||||||
|
failedReasons.setdefault(reason, {'Reason':reason, 'Solution':solutionHpcVmDriversNotInstalled, 'Nodes':set()})['Nodes'].add(failedNode)
|
||||||
elif "Microsoft MPI is not installed" in output:
|
elif "Microsoft MPI is not installed" in output:
|
||||||
reason = reasonMsmpiNotInstalled
|
reason = reasonMsmpiNotInstalled
|
||||||
failedNode = nodeName
|
failedNode = nodeName
|
||||||
|
@ -901,6 +912,9 @@ def mpiPingpongGetFailedReasons(failedTasks, mpiVersion, canceledTasks):
|
||||||
nodesOrPairs = value.get(reasonMsmpiNotInstalled)
|
nodesOrPairs = value.get(reasonMsmpiNotInstalled)
|
||||||
if nodesOrPairs:
|
if nodesOrPairs:
|
||||||
value[reasonMsmpiNotInstalled] = list(set(nodesOrPairs))
|
value[reasonMsmpiNotInstalled] = list(set(nodesOrPairs))
|
||||||
|
nodesOrPairs = value.get(reasonHpcVmDriversNotInstalled)
|
||||||
|
if nodesOrPairs:
|
||||||
|
value[reasonHpcVmDriversNotInstalled] = list(set(nodesOrPairs))
|
||||||
for key in failedReasonsByNode.keys():
|
for key in failedReasonsByNode.keys():
|
||||||
severity = failedReasonsByNode[key].pop('Severity')
|
severity = failedReasonsByNode[key].pop('Severity')
|
||||||
failedReasonsByNode["{} ({})".format(key, severity)] = failedReasonsByNode.pop(key)
|
failedReasonsByNode["{} ({})".format(key, severity)] = failedReasonsByNode.pop(key)
|
||||||
|
|
Загрузка…
Ссылка в новой задаче