Auto generate ibstat file for topo aware traffic pattern (#381)
An enhancement for topo-aware IB performance validation #373. This PR auto-generates the required ibstat file `ib_traffic_topo_aware_ibstat.txt`, which is used as input to build the topology graph.
This commit is contained in:
Родитель
b5c7c85d17
Коммит
faeee0a7cc
|
@ -18,7 +18,7 @@ steps:
|
|||
echo "##vso[task.prependpath]$HOME/.local/bin"
|
||||
displayName: Export path
|
||||
- script: |
|
||||
python3 -m pip install .[test,nvidia,torch,ort]
|
||||
python3 -m pip install .[test,nvidia,torch,ort,mpi]
|
||||
make postinstall
|
||||
displayName: Install dependencies
|
||||
- script: |
|
||||
|
|
|
@ -128,6 +128,6 @@ ADD third_party third_party
|
|||
RUN make -C third_party cuda
|
||||
|
||||
ADD . .
|
||||
RUN python3 -m pip install .[nvidia,torch,ort] && \
|
||||
RUN python3 -m pip install .[nvidia,torch,ort,mpi] && \
|
||||
make cppbuild && \
|
||||
make postinstall
|
||||
|
|
|
@ -124,6 +124,6 @@ ADD third_party third_party
|
|||
RUN make -C third_party rocm
|
||||
|
||||
ADD . .
|
||||
RUN python3 -m pip install .[torch,ort] && \
|
||||
RUN python3 -m pip install .[torch,ort,mpi] && \
|
||||
make cppbuild && \
|
||||
make postinstall
|
||||
|
|
|
@ -139,6 +139,6 @@ ADD third_party third_party
|
|||
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm
|
||||
|
||||
ADD . .
|
||||
RUN python3 -m pip install .[torch,ort] && \
|
||||
RUN python3 -m pip install .[torch,ort,mpi] && \
|
||||
make cppbuild && \
|
||||
make postinstall
|
||||
|
|
1
setup.py
1
setup.py
|
@ -191,6 +191,7 @@ setup(
|
|||
'torchvision>=0.8.0a0',
|
||||
'transformers>=4.3.3',
|
||||
],
|
||||
'mpi': ['mpi4py>=3.1.3'],
|
||||
},
|
||||
include_package_data=True,
|
||||
entry_points={
|
||||
|
|
|
@ -4,7 +4,11 @@
|
|||
"""Topology Aware Utilities."""
|
||||
|
||||
import re
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import networkx as nx
|
||||
|
||||
from superbench.common.utils import logger
|
||||
|
||||
|
||||
|
@ -31,6 +35,39 @@ class quick_regexp(object):
|
|||
return self.matched
|
||||
|
||||
|
||||
def gen_ibstat_file(ibstat_file):
    """Generate ibstat file for each node with specified path.

    Every MPI rank builds one record made of a ``VM_hostname <name>`` line followed by
    the 'System image GUID' values extracted from the local ``ibstat`` output. The
    records of all ranks are exchanged with ``allgather``, de-duplicated, and written
    to ``ibstat_file`` by each rank.

    Args:
        ibstat_file (str): path of ibstat output.
    """
    from mpi4py import MPI

    # Remember whether MPI is started here. An MPI environment that was already
    # initialized is owned by someone else and must not be finalized by this helper,
    # otherwise any later MPI call (including a second call of this function) fails.
    initialized_here = not MPI.Is_initialized()
    if initialized_here:
        MPI.Init()

    comm = MPI.COMM_WORLD
    name = MPI.Get_processor_name()

    # The command to fetch ibstat info
    cmd = r"ibstat | grep -Po 'System image GUID: \K\S+$'"
    # Use the pipe as a context manager so it is closed instead of leaked.
    with os.popen(cmd) as output:
        ibstat = 'VM_hostname ' + name + '\n' + str(output.read())

    # Fetch all ibstat records from each rank
    ibstats = comm.allgather(ibstat)

    # Filter the duplicate info. Sorting makes the file content deterministic, so
    # every rank that writes the same path writes exactly the same bytes.
    ibstat_infos = sorted(set(ibstats))

    with Path(ibstat_file).open(mode='w') as f:
        f.writelines(ibstat_infos)

    if initialized_here:
        MPI.Finalize()
|
||||
|
||||
|
||||
def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901
|
||||
"""Generate topology aware config list in specified distance range.
|
||||
|
||||
|
@ -47,15 +84,24 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
|
|||
topology distance (#hops).
|
||||
"""
|
||||
config = []
|
||||
if not ibstat_file or not ibnetdiscover_file:
|
||||
logger.error('Either ibstat or ibnetdiscover not specified.')
|
||||
# Check validity of input parameters
|
||||
if not ibnetdiscover_file:
|
||||
logger.error('ibnetdiscover file is not specified.')
|
||||
return config
|
||||
|
||||
if not ibstat_file:
|
||||
ibstat_file = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'ib_traffic_topo_aware_ibstat.txt')
|
||||
gen_ibstat_file(ibstat_file)
|
||||
|
||||
if not Path(ibstat_file).exists():
|
||||
logger.error('ibstat file does not exist.')
|
||||
return config
|
||||
|
||||
if min_dist > max_dist:
|
||||
logger.error('Specified minimum distane ({}) is larger than maximum distance ({}).'.format(min_dist, max_dist))
|
||||
return config
|
||||
|
||||
# index each hostname in hostfile
|
||||
# Index each hostname in hostfile
|
||||
host_idx = dict()
|
||||
idx = 0
|
||||
for h in host_list:
|
||||
|
|
Загрузка…
Ссылка в новой задаче