add dist main, distributed mode functional

This commit is contained in:
Shital Shah 2020-04-23 08:52:01 -07:00
Parent a81c871435
Commit f13b652e35
3 changed files with 18 additions and 3 deletions

View file

@@ -236,7 +236,7 @@ def _setup_logger():
logs_yaml_filepath = utils.full_path(os.path.join(distdir, f'logs_{global_rank}.yaml'))
experiment_name = get_experiment_name() + '_' + str(global_rank)
enable_stdout = False
print(f'No stdout logging for replica {global_rank}')
print(f'log_global_rank={global_rank}, log_stdout={enable_stdout}, log_file={sys_log_filepath}')
sys_logger = utils.create_logger(filepath=sys_log_filepath,
name=experiment_name,
@@ -247,9 +247,9 @@ def _setup_logger():
# reset to new file path
logger.reset(logs_yaml_filepath, sys_logger)
logger.info({'command_line': ' '.join(sys.argv[1:])})
logger.info({
'datetime:': datetime.datetime.now(),
'command_line': ' '.join(sys.argv[1:]),
'logger_global_rank': global_rank,
'logger_enable_stdout': enable_stdout,
'sys_log_filepath': sys_log_filepath
})

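The hunk above gives each distributed replica its own log file keyed by global rank and disables stdout logging for that replica. A minimal sketch of the per-rank logging pattern, using only the standard library and illustrative names (distdir, experiment_name) rather than the exact archai utilities:

import logging
import os

def setup_replica_logger(distdir: str, experiment_name: str, global_rank: int) -> logging.Logger:
    # One file per replica so workers started by the distributed launcher do not interleave output.
    sys_log_filepath = os.path.join(distdir, f'sys_{global_rank}.log')
    logger = logging.getLogger(f'{experiment_name}_{global_rank}')
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler(sys_log_filepath))
    # No StreamHandler is attached: replicas log to file only, mirroring enable_stdout = False above.
    return logger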
View file

@@ -20,7 +20,6 @@ class Trainer(EnforceOverrides):
checkpoint:Optional[CheckPoint])->None:
# region config vars
conf_lossfn = conf_train['lossfn']
self._apex = conf_train['apex']
self._aux_weight = conf_train['aux_weight']
self._grad_clip = conf_train['grad_clip']
self._drop_path_prob = conf_train['drop_path_prob']

16 scripts/dist_main.sh Normal file
View file

@@ -0,0 +1,16 @@
#!/bin/bash
# fail on any error
set -e
nvidia-smi --list-gpus
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo "*****************************************"
echo "Using $gpu_count GPUS"
echo "*****************************************"
set -e -o xtrace
python -m torch.distributed.launch --nproc_per_node=$gpu_count scripts/main.py "$@"
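torch.distributed.launch starts one worker process per GPU and hands each one a --local_rank argument plus the usual rank/world-size environment variables. A minimal sketch, under the assumption (not shown in this commit) that scripts/main.py consumes them roughly like this:

import argparse

import torch
import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)  # injected by torch.distributed.launch
args, _ = parser.parse_known_args()

torch.cuda.set_device(args.local_rank)       # pin this worker to its GPU
dist.init_process_group(backend='nccl')      # rank and world size come from env vars set by the launcher
print(f'worker {dist.get_rank()} of {dist.get_world_size()} initialized')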