Auto generate ibstat file for topo aware traffic pattern (#381)

An enhancement for topo-aware IB performance validation #373.
This PR auto-generates the required ibstat file `ib_traffic_topo_aware_ibstat.txt`, which is used as input to build the topology graph.
This commit is contained in:
Yang Wang 2022-08-13 18:20:42 +08:00 коммит произвёл GitHub
Родитель b5c7c85d17
Коммит faeee0a7cc
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 54 добавлений и 7 удалений

Просмотреть файл

@ -18,7 +18,7 @@ steps:
echo "##vso[task.prependpath]$HOME/.local/bin"
displayName: Export path
- script: |
python3 -m pip install .[test,nvidia,torch,ort]
python3 -m pip install .[test,nvidia,torch,ort,mpi]
make postinstall
displayName: Install dependencies
- script: |

Просмотреть файл

@ -128,6 +128,6 @@ ADD third_party third_party
RUN make -C third_party cuda
ADD . .
RUN python3 -m pip install .[nvidia,torch,ort] && \
RUN python3 -m pip install .[nvidia,torch,ort,mpi] && \
make cppbuild && \
make postinstall

Просмотреть файл

@ -124,6 +124,6 @@ ADD third_party third_party
RUN make -C third_party rocm
ADD . .
RUN python3 -m pip install .[torch,ort] && \
RUN python3 -m pip install .[torch,ort,mpi] && \
make cppbuild && \
make postinstall

Просмотреть файл

@ -139,6 +139,6 @@ ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm
ADD . .
RUN python3 -m pip install .[torch,ort] && \
RUN python3 -m pip install .[torch,ort,mpi] && \
make cppbuild && \
make postinstall

Просмотреть файл

@ -191,6 +191,7 @@ setup(
'torchvision>=0.8.0a0',
'transformers>=4.3.3',
],
'mpi': ['mpi4py>=3.1.3'],
},
include_package_data=True,
entry_points={

Просмотреть файл

@ -4,7 +4,11 @@
"""Topology Aware Utilities."""
import re
import os
from pathlib import Path
import networkx as nx
from superbench.common.utils import logger
@ -31,6 +35,39 @@ class quick_regexp(object):
return self.matched
def gen_ibstat_file(ibstat_file):
    """Generate ibstat file for each node with specified path.

    Each MPI rank runs ``ibstat`` locally, prefixes the extracted
    "System image GUID" values with a ``VM_hostname <processor name>`` line,
    exchanges the resulting record with all other ranks, and writes the
    de-duplicated records to ``ibstat_file``.

    Args:
        ibstat_file (str): path of ibstat output.
    """
    # Imported lazily so that mpi4py is only required when this helper runs.
    from mpi4py import MPI
    if not MPI.Is_initialized():
        MPI.Init()
    comm = MPI.COMM_WORLD
    name = MPI.Get_processor_name()

    # The command to fetch ibstat info
    cmd = r"ibstat | grep -Po 'System image GUID: \K\S+$'"
    # Use the pipe as a context manager so the child process and its file
    # descriptor are released as soon as the output has been read.
    with os.popen(cmd) as output:
        ibstat = 'VM_hostname ' + name + '\n' + output.read()

    # Fetch all ibstat info from each node
    ibstats = comm.allgather(ibstat)

    # Filter the duplicate info (presumably several ranks can share a host)
    # and sort it so the file content is identical across ranks and runs
    # instead of depending on set iteration order.
    ibstat_infos = sorted(set(ibstats))
    with Path(ibstat_file).open(mode='w') as f:
        f.writelines(ibstat_infos)

    # NOTE(review): MPI is finalized even when it was already initialized
    # before this call; confirm no caller needs MPI afterwards.
    MPI.Finalize()
def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist, max_dist): # noqa: C901
"""Generate topology aware config list in specified distance range.
@ -47,15 +84,24 @@ def gen_topo_aware_config(host_list, ibstat_file, ibnetdiscover_file, min_dist,
topology distance (#hops).
"""
config = []
if not ibstat_file or not ibnetdiscover_file:
logger.error('Either ibstat or ibnetdiscover not specified.')
# Check validity of input parameters
if not ibnetdiscover_file:
logger.error('ibnetdiscover file is not specified.')
return config
if not ibstat_file:
ibstat_file = os.path.join(os.environ.get('SB_WORKSPACE', '.'), 'ib_traffic_topo_aware_ibstat.txt')
gen_ibstat_file(ibstat_file)
if not Path(ibstat_file).exists():
logger.error('ibstat file does not exist.')
return config
if min_dist > max_dist:
logger.error('Specified minimum distane ({}) is larger than maximum distance ({}).'.format(min_dist, max_dist))
return config
# index each hostname in hostfile
# Index each hostname in hostfile
host_idx = dict()
idx = 0
for h in host_list: