After bug fix and refactoring; getting ready to rerun experiments.

Debadeepta Dey 2021-01-25 12:20:07 -08:00 committed by Gustavo Rosa
Parent 4941e51829
Commit 251eef76cd
7 changed files with 13 additions and 9 deletions

View file

@@ -34,17 +34,17 @@ class ConditionalTrainer(ArchTrainer, EnforceOverrides):
         super().__init__(conf_train, model, checkpoint)

         # region config vars specific to freeze trainer
-        self._val_top1_acc = conf_train['val_top1_acc_threshold']
+        self._train_top1_acc_threshold = conf_train['train_top1_acc_threshold']
         # endregion

     @overrides
     def _should_terminate(self):
         # if current validation accuracy is above threshold
         # terminate training
-        best_val_top1_avg = self._metrics.best_val_top1()
+        best_train_top1_avg = self._metrics.best_train_top1()
-        if best_val_top1_avg >= self._val_top1_acc:
-            logger.info(f'terminating at {best_val_top1_avg}')
+        if best_train_top1_avg >= self._train_top1_acc_threshold:
+            logger.info(f'terminating at {best_train_top1_avg}')
             logger.info('----------terminating regular training---------')
             return True
         else:
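
For orientation, a minimal sketch of how a trainer's epoch loop might consult this termination check. Only the _should_terminate name and the train-top1 threshold semantics come from the hunk above; the class, loop, and helper names are assumptions, not code from this commit.

# Hypothetical sketch: an epoch loop that stops early once the best
# train top-1 accuracy crosses the configured threshold.
class SketchTrainer:
    def __init__(self, train_top1_acc_threshold: float) -> None:
        self._train_top1_acc_threshold = train_top1_acc_threshold
        self._best_train_top1 = 0.0

    def _should_terminate(self) -> bool:
        # mirrors ConditionalTrainer: compare best train accuracy so far
        return self._best_train_top1 >= self._train_top1_acc_threshold

    def fit(self, epochs: int) -> None:
        for epoch in range(epochs):
            epoch_top1 = self._train_one_epoch()
            self._best_train_top1 = max(self._best_train_top1, epoch_top1)
            if self._should_terminate():
                break  # early exit instead of running all epochs

    def _train_one_epoch(self) -> float:
        return 0.0  # placeholder; a real trainer returns train top-1 accuracy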

View file

@@ -59,7 +59,6 @@ class FreezeTrainer(ArchTrainer, EnforceOverrides):
         # logger.info(f'{name} requires grad')
         # Do it via parameters
         # NOTE: freezing via named_parameters() doesn't expose all parameters? Check with Shital.
         for param in self.model.parameters():
             param.requires_grad = False
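
The loop above is the standard PyTorch way to freeze everything. A sketch, assuming a plain nn.Module, of the fuller pattern a freeze trainer needs: freeze all parameters, then re-enable gradients only for a trailing subset. The function and module names here are illustrative, not from this repo.

# Sketch (assumes PyTorch): freeze every parameter via parameters(),
# then unfreeze only the chosen tail modules. parameters() is used
# because the NOTE above questions whether named_parameters() reaches
# everything.
import torch.nn as nn

def freeze_all_but_tail(model: nn.Module, tail_names: set) -> None:
    for param in model.parameters():
        param.requires_grad = False          # freeze everything first
    for name, module in model.named_modules():
        if name in tail_names:
            for param in module.parameters():
                param.requires_grad = True   # re-enable the tail only

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
freeze_all_but_tail(model, tail_names={'2'})  # unfreeze the last Linear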

View file

@@ -242,9 +242,11 @@ class Metrics:
+    def best_train_top1(self)->float:
+        return self.run_metrics.best_epoch()[0].top1.avg

     def best_val_top1(self)->float:
         val_epoch_metrics = self.run_metrics.best_epoch()[1]
         return val_epoch_metrics.top1.avg if val_epoch_metrics is not None else math.nan

     def best_test_top1(self)->float:
         test_epoch_metrics = self.run_metrics.best_epoch()[2]
         return test_epoch_metrics.top1.avg if test_epoch_metrics is not None else math.nan
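
The [0]/[1]/[2] indices encode an implicit contract. A sketch, under the assumption that best_epoch() returns per-split metrics as a (train, val, test) tuple with None for splits that were not evaluated; the class names are illustrative.

# Assumed contract behind best_epoch(): index 0 = train, 1 = val,
# 2 = test; val/test may be None, hence the math.nan guards above.
import math
from typing import Optional

class _Avg:
    def __init__(self, avg: float) -> None:
        self.avg = avg

class EpochMetricsSketch:
    def __init__(self, top1_avg: float) -> None:
        self.top1 = _Avg(top1_avg)

def top1_or_nan(em: Optional[EpochMetricsSketch]) -> float:
    # same nan-guard as best_val_top1/best_test_top1 above
    return em.top1.avg if em is not None else math.nan

best = (EpochMetricsSketch(0.92), EpochMetricsSketch(0.71), None)
print(top1_or_nan(best[1]), top1_or_nan(best[2]))  # 0.71 nan
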
@@ -338,6 +340,7 @@ class RunMetrics:
     def pre_run(self):
         self.start_time = time.time()

     def post_run(self, test_metrics:Optional['Metrics']=None):
         self.end_time = time.time()
         self.test_metrics = test_metrics
@@ -359,6 +362,7 @@ class RunMetrics:
         best_val = max(self.epochs_metrics,
                        key=lambda e:e.val_metrics.top1.avg if e.val_metrics else -1)
         best_val = best_val.val_metrics if best_val.val_metrics else None
         best_test = self.test_metrics.run_metrics.epochs_metrics[-1] \
@@ -368,6 +372,7 @@ class RunMetrics:
     def epoch_time_avg(self):
         return statistics.mean((e.duration() for e in self.epochs_metrics))

     def step_time_avg(self):
         return statistics.mean((e.step_time.avg for e in self.epochs_metrics))
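
A small sketch of the bookkeeping these averages assume; duration() and step_time.avg are the names used above, while the accumulator itself is illustrative. Note that statistics.mean accepts the generator expressions directly.

# Illustrative timing bookkeeping: duration() from wall-clock
# start/end times, step_time as a running average.
import statistics
import time

class StepTimeAvg:
    def __init__(self) -> None:
        self.sum, self.count = 0.0, 0
    def update(self, seconds: float) -> None:
        self.sum += seconds
        self.count += 1
    @property
    def avg(self) -> float:
        return self.sum / self.count if self.count else 0.0

class EpochSketch:
    def __init__(self) -> None:
        self.start_time = time.time()
        self.end_time = self.start_time + 1.0  # pretend a 1 s epoch
        self.step_time = StepTimeAvg()
    def duration(self) -> float:
        return self.end_time - self.start_time

epochs = [EpochSketch(), EpochSketch()]
print(statistics.mean(e.duration() for e in epochs))      # 1.0
print(statistics.mean(e.step_time.avg for e in epochs))   # 0.0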

View file

@@ -21,7 +21,7 @@ nas:
trainer:
plotsdir: ''
epochs: 2
-val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
train_regular: False # if False the full regular training of the architecture will be bypassed
freeze_trainer:
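
Since the rename touches both the trainer code and every experiment YAML, a guard like the following can catch configs that still carry the old key. This check is not in the commit; it assumes PyYAML and is only a sketch of keeping code and configs in sync.

# Hypothetical check, not from this commit: reject configs that still
# use the pre-rename key.
import yaml  # assumes PyYAML is installed

def check_trainer_conf(trainer_conf: dict) -> None:
    assert 'val_top1_acc_threshold' not in trainer_conf, \
        'stale key: rename to train_top1_acc_threshold'
    assert 'train_top1_acc_threshold' in trainer_conf, \
        'missing train_top1_acc_threshold'

conf = yaml.safe_load("""
trainer:
  plotsdir: ''
  epochs: 2
  train_top1_acc_threshold: 0.10
""")
check_trainer_conf(conf['trainer'])  # passes silently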

View file

@@ -18,7 +18,7 @@ nas:
aux_weight: 0.0
grad_clip: 0.0
drop_path_prob: 0.0 # probability that given edge will be dropped
-val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
train_regular: True # if False the full regular training of the architecture will be bypassed
epochs: 200
optimizer:

View file

@@ -20,7 +20,7 @@ nas:
train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
trainer:
plotsdir: ''
-val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'

View file

@@ -21,7 +21,7 @@ nas:
train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
trainer:
plotsdir: ''
-val_top1_acc_threshold: 0.3 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.4 # after some accuracy we will shift into training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'