зеркало из https://github.com/microsoft/archai.git
add dist main, distributed mode functional
This commit is contained in:
Родитель
a81c871435
Коммит
f13b652e35
|
@ -236,7 +236,7 @@ def _setup_logger():
|
|||
logs_yaml_filepath = utils.full_path(os.path.join(distdir, f'logs_{global_rank}.yaml'))
|
||||
experiment_name = get_experiment_name() + '_' + str(global_rank)
|
||||
enable_stdout = False
|
||||
print(f'No stdout logging for replica {global_rank}')
|
||||
print(f'log_global_rank={global_rank}, log_stdout={sys_log_filepath}, log_file={sys_log_filepath}')
|
||||
|
||||
sys_logger = utils.create_logger(filepath=sys_log_filepath,
|
||||
name=experiment_name,
|
||||
|
@ -247,9 +247,9 @@ def _setup_logger():
|
|||
|
||||
# reset to new file path
|
||||
logger.reset(logs_yaml_filepath, sys_logger)
|
||||
logger.info({'command_line': ' '.join(sys.argv[1:])})
|
||||
logger.info({
|
||||
'datetime:': datetime.datetime.now(),
|
||||
'command_line': ' '.join(sys.argv[1:]),
|
||||
'logger_global_rank': global_rank,
|
||||
'logger_enable_stdout': enable_stdout,
|
||||
'sys_log_filepath': sys_log_filepath
|
||||
|
|
|
@ -20,7 +20,6 @@ class Trainer(EnforceOverrides):
|
|||
checkpoint:Optional[CheckPoint])->None:
|
||||
# region config vars
|
||||
conf_lossfn = conf_train['lossfn']
|
||||
self._apex = conf_train['apex']
|
||||
self._aux_weight = conf_train['aux_weight']
|
||||
self._grad_clip = conf_train['grad_clip']
|
||||
self._drop_path_prob = conf_train['drop_path_prob']
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash
# Launch scripts/main.py through torch.distributed.launch with one worker
# process per GPU reported by nvidia-smi. Every command-line argument given
# to this script is forwarded unchanged to scripts/main.py.

# fail if any errors, including a failure on the left side of a pipeline
set -e -o pipefail

nvidia-smi --list-gpus

# the query prints one line per GPU (header suppressed), so the line count
# is used as the number of GPUs
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

echo "*****************************************"
echo "Using $gpu_count GPUS"
echo "*****************************************"

# echo each command before it runs (errexit is already enabled above)
set -o xtrace

# "$@" passes each argument through as its own word; an unquoted $* would
# re-split arguments containing spaces and expand any glob characters
python -m torch.distributed.launch --nproc_per_node="$gpu_count" scripts/main.py "$@"
|
Загрузка…
Ссылка в новой задаче