зеркало из https://github.com/microsoft/archai.git
After bug fix and refactoring. Getting ready to rerun experiments.
This commit is contained in:
Родитель
4941e51829
Коммит
251eef76cd
|
@ -34,17 +34,17 @@ class ConditionalTrainer(ArchTrainer, EnforceOverrides):
|
|||
super().__init__(conf_train, model, checkpoint)
|
||||
|
||||
# region config vars specific to freeze trainer
|
||||
self._val_top1_acc = conf_train['val_top1_acc_threshold']
|
||||
self._train_top1_acc_threshold = conf_train['train_top1_acc_threshold']
|
||||
# endregion
|
||||
|
||||
@overrides
|
||||
def _should_terminate(self):
|
||||
# if the best top-1 accuracy seen so far has reached the configured threshold,
|
||||
# terminate training
|
||||
best_val_top1_avg = self._metrics.best_val_top1()
|
||||
best_train_top1_avg = self._metrics.best_train_top1()
|
||||
|
||||
if best_val_top1_avg >= self._val_top1_acc:
|
||||
logger.info(f'terminating at {best_val_top1_avg}')
|
||||
if best_train_top1_avg >= self._train_top1_acc_threshold:
|
||||
logger.info(f'terminating at {best_train_top1_avg}')
|
||||
logger.info('----------terminating regular training---------')
|
||||
return True
|
||||
else:
|
||||
|
|
|
@ -59,7 +59,6 @@ class FreezeTrainer(ArchTrainer, EnforceOverrides):
|
|||
# logger.info(f'{name} requires grad')
|
||||
|
||||
# Do it via parameters
|
||||
# NOTE: freezing via named_parameters() doesn't expose all parameters? Check with Shital.
|
||||
for param in self.model.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
|
|
|
@ -242,9 +242,11 @@ class Metrics:
|
|||
|
||||
def best_train_top1(self)->float:
|
||||
return self.run_metrics.best_epoch()[0].top1.avg
|
||||
|
||||
def best_val_top1(self)->float:
|
||||
val_epoch_metrics = self.run_metrics.best_epoch()[1]
|
||||
return val_epoch_metrics.top1.avg if val_epoch_metrics is not None else math.nan
|
||||
|
||||
def best_test_top1(self)->float:
|
||||
test_epoch_metrics = self.run_metrics.best_epoch()[2]
|
||||
return test_epoch_metrics.top1.avg if test_epoch_metrics is not None else math.nan
|
||||
|
@ -338,6 +340,7 @@ class RunMetrics:
|
|||
|
||||
def pre_run(self):
|
||||
self.start_time = time.time()
|
||||
|
||||
def post_run(self, test_metrics:Optional['Metrics']=None):
|
||||
self.end_time = time.time()
|
||||
self.test_metrics = test_metrics
|
||||
|
@ -359,6 +362,7 @@ class RunMetrics:
|
|||
|
||||
best_val = max(self.epochs_metrics,
|
||||
key=lambda e:e.val_metrics.top1.avg if e.val_metrics else -1)
|
||||
|
||||
best_val = best_val.val_metrics if best_val.val_metrics else None
|
||||
|
||||
best_test = self.test_metrics.run_metrics.epochs_metrics[-1] \
|
||||
|
@ -368,6 +372,7 @@ class RunMetrics:
|
|||
|
||||
def epoch_time_avg(self):
|
||||
return statistics.mean((e.duration() for e in self.epochs_metrics))
|
||||
|
||||
def step_time_avg(self):
|
||||
return statistics.mean((e.step_time.avg for e in self.epochs_metrics))
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ nas:
|
|||
trainer:
|
||||
plotsdir: ''
|
||||
epochs: 2
|
||||
val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
|
||||
train_top1_acc_threshold: 0.10 # once the best train top-1 accuracy reaches this value, we switch to training only the last 'n' layers
|
||||
train_regular: False # if False the full regular training of the architecture will be bypassed
|
||||
|
||||
freeze_trainer:
|
||||
|
|
|
@ -18,7 +18,7 @@ nas:
|
|||
aux_weight: 0.0
|
||||
grad_clip: 0.0
|
||||
drop_path_prob: 0.0 # probability that given edge will be dropped
|
||||
val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
|
||||
train_top1_acc_threshold: 0.10 # once the best train top-1 accuracy reaches this value, we switch to training only the last 'n' layers
|
||||
train_regular: True # if False the full regular training of the architecture will be bypassed
|
||||
epochs: 200
|
||||
optimizer:
|
||||
|
|
|
@ -20,7 +20,7 @@ nas:
|
|||
train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
|
||||
trainer:
|
||||
plotsdir: ''
|
||||
val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
|
||||
train_top1_acc_threshold: 0.60 # once the best train top-1 accuracy reaches this value, we switch to training only the last 'n' layers
|
||||
apex:
|
||||
_copy: '/common/apex'
|
||||
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
|
||||
|
|
|
@ -21,7 +21,7 @@ nas:
|
|||
train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
|
||||
trainer:
|
||||
plotsdir: ''
|
||||
val_top1_acc_threshold: 0.3 # after some accuracy we will shift into training only the last 'n' layers
|
||||
train_top1_acc_threshold: 0.4 # once the best train top-1 accuracy reaches this value, we switch to training only the last 'n' layers
|
||||
apex:
|
||||
_copy: '/common/apex'
|
||||
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'
|
||||
|
|
Загрузка…
Ссылка в новой задаче