After bug fix and refactoring; getting ready to rerun experiments.

Debadeepta Dey 2021-01-25 12:20:07 -08:00 committed by Gustavo Rosa
Parent 4941e51829
Commit 251eef76cd
7 changed files with 13 additions and 9 deletions

View file

@@ -34,17 +34,17 @@ class ConditionalTrainer(ArchTrainer, EnforceOverrides):
         super().__init__(conf_train, model, checkpoint)

         # region config vars specific to freeze trainer
-        self._val_top1_acc = conf_train['val_top1_acc_threshold']
+        self._train_top1_acc_threshold = conf_train['train_top1_acc_threshold']
         # endregion

     @overrides
     def _should_terminate(self):
         # if current validation accuracy is above threshold
         # terminate training
-        best_val_top1_avg = self._metrics.best_val_top1()
+        best_train_top1_avg = self._metrics.best_train_top1()
-        if best_val_top1_avg >= self._val_top1_acc:
-            logger.info(f'terminating at {best_val_top1_avg}')
+        if best_train_top1_avg >= self._train_top1_acc_threshold:
+            logger.info(f'terminating at {best_train_top1_avg}')
             logger.info('----------terminating regular training---------')
             return True
         else:
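
For orientation, a minimal sketch of how a trainer's epoch loop might consult this termination check. Only the _should_terminate name and the train-top1 threshold semantics come from the hunk above; the class, loop, and helper names are assumptions, not code from this commit.

# Hypothetical sketch: an epoch loop that stops early once the best
# train top-1 accuracy crosses the configured threshold.
class SketchTrainer:
    def __init__(self, train_top1_acc_threshold: float) -> None:
        self._train_top1_acc_threshold = train_top1_acc_threshold
        self._best_train_top1 = 0.0

    def _should_terminate(self) -> bool:
        # mirrors ConditionalTrainer: compare best train accuracy so far
        return self._best_train_top1 >= self._train_top1_acc_threshold

    def fit(self, epochs: int) -> None:
        for epoch in range(epochs):
            epoch_top1 = self._train_one_epoch()
            self._best_train_top1 = max(self._best_train_top1, epoch_top1)
            if self._should_terminate():
                break  # early exit instead of running all epochs

    def _train_one_epoch(self) -> float:
        return 0.0  # placeholder; a real trainer returns train top-1 accuracy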

View file

@@ -59,7 +59,6 @@ class FreezeTrainer(ArchTrainer, EnforceOverrides):
         # logger.info(f'{name} requires grad')
         # Do it via parameters
         # NOTE: freezing via named_parameters() doesn't expose all parameters? Check with Shital.
         for param in self.model.parameters():
             param.requires_grad = False
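
The loop above is the standard PyTorch way to freeze everything. A sketch, assuming a plain nn.Module, of the fuller pattern a freeze trainer needs: freeze all parameters, then re-enable gradients only for a trailing subset. The function and module names here are illustrative, not from this repo.

# Sketch (assumes PyTorch): freeze every parameter via parameters(),
# then unfreeze only the chosen tail modules. parameters() is used
# because the NOTE above questions whether named_parameters() reaches
# everything.
import torch.nn as nn

def freeze_all_but_tail(model: nn.Module, tail_names: set) -> None:
    for param in model.parameters():
        param.requires_grad = False          # freeze everything first
    for name, module in model.named_modules():
        if name in tail_names:
            for param in module.parameters():
                param.requires_grad = True   # re-enable the tail only

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
freeze_all_but_tail(model, tail_names={'2'})  # unfreeze the last Linear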

View file

@@ -242,9 +242,11 @@ class Metrics:
+    def best_train_top1(self)->float:
+        return self.run_metrics.best_epoch()[0].top1.avg

     def best_val_top1(self)->float:
         val_epoch_metrics = self.run_metrics.best_epoch()[1]
         return val_epoch_metrics.top1.avg if val_epoch_metrics is not None else math.nan

     def best_test_top1(self)->float:
         test_epoch_metrics = self.run_metrics.best_epoch()[2]
         return test_epoch_metrics.top1.avg if test_epoch_metrics is not None else math.nan
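
The [0]/[1]/[2] indices encode an implicit contract. A sketch, under the assumption that best_epoch() returns per-split metrics as a (train, val, test) tuple with None for splits that were not evaluated; the class names are illustrative.

# Assumed contract behind best_epoch(): index 0 = train, 1 = val,
# 2 = test; val/test may be None, hence the math.nan guards above.
import math
from typing import Optional

class _Avg:
    def __init__(self, avg: float) -> None:
        self.avg = avg

class EpochMetricsSketch:
    def __init__(self, top1_avg: float) -> None:
        self.top1 = _Avg(top1_avg)

def top1_or_nan(em: Optional[EpochMetricsSketch]) -> float:
    # same nan-guard as best_val_top1/best_test_top1 above
    return em.top1.avg if em is not None else math.nan

best = (EpochMetricsSketch(0.92), EpochMetricsSketch(0.71), None)
print(top1_or_nan(best[1]), top1_or_nan(best[2]))  # 0.71 nan
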
@@ -338,6 +340,7 @@ class RunMetrics:
     def pre_run(self):
         self.start_time = time.time()

     def post_run(self, test_metrics:Optional['Metrics']=None):
         self.end_time = time.time()
         self.test_metrics = test_metrics
@@ -359,6 +362,7 @@ class RunMetrics:
         best_val = max(self.epochs_metrics,
                        key=lambda e:e.val_metrics.top1.avg if e.val_metrics else -1)
         best_val = best_val.val_metrics if best_val.val_metrics else None
         best_test = self.test_metrics.run_metrics.epochs_metrics[-1] \
@@ -368,6 +372,7 @@ class RunMetrics:
     def epoch_time_avg(self):
         return statistics.mean((e.duration() for e in self.epochs_metrics))

     def step_time_avg(self):
         return statistics.mean((e.step_time.avg for e in self.epochs_metrics))
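
A small sketch of the bookkeeping these averages assume; duration() and step_time.avg are the names used above, while the accumulator itself is illustrative. Note that statistics.mean accepts the generator expressions directly.

# Illustrative timing bookkeeping: duration() from wall-clock
# start/end times, step_time as a running average.
import statistics
import time

class StepTimeAvg:
    def __init__(self) -> None:
        self.sum, self.count = 0.0, 0
    def update(self, seconds: float) -> None:
        self.sum += seconds
        self.count += 1
    @property
    def avg(self) -> float:
        return self.sum / self.count if self.count else 0.0

class EpochSketch:
    def __init__(self) -> None:
        self.start_time = time.time()
        self.end_time = self.start_time + 1.0  # pretend a 1 s epoch
        self.step_time = StepTimeAvg()
    def duration(self) -> float:
        return self.end_time - self.start_time

epochs = [EpochSketch(), EpochSketch()]
print(statistics.mean(e.duration() for e in epochs))      # 1.0
print(statistics.mean(e.step_time.avg for e in epochs))   # 0.0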

View file

@@ -21,7 +21,7 @@ nas:
trainer:
plotsdir: ''
epochs: 2
-val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
train_regular: False # if False the full regular training of the architecture will be bypassed
freeze_trainer:
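
Since the rename touches both the trainer code and every experiment YAML, a guard like the following can catch configs that still carry the old key. This check is not in the commit; it assumes PyYAML and is only a sketch of keeping code and configs in sync.

# Hypothetical check, not from this commit: reject configs that still
# use the pre-rename key.
import yaml  # assumes PyYAML is installed

def check_trainer_conf(trainer_conf: dict) -> None:
    assert 'val_top1_acc_threshold' not in trainer_conf, \
        'stale key: rename to train_top1_acc_threshold'
    assert 'train_top1_acc_threshold' in trainer_conf, \
        'missing train_top1_acc_threshold'

conf = yaml.safe_load("""
trainer:
  plotsdir: ''
  epochs: 2
  train_top1_acc_threshold: 0.10
""")
check_trainer_conf(conf['trainer'])  # passes silently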

View file

@@ -18,7 +18,7 @@ nas:
aux_weight: 0.0
grad_clip: 0.0
drop_path_prob: 0.0 # probability that given edge will be dropped
-val_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.10 # after some accuracy we will shift into training only the last 'n' layers
train_regular: True # if False the full regular training of the architecture will be bypassed
epochs: 200
optimizer:

View file

@@ -20,7 +20,7 @@ nas:
train_batch: 256 # batch size for freeze training. 256 works with 5gb usage on 2080Ti.
trainer:
plotsdir: ''
-val_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.60 # after some accuracy we will shift into training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'

View file

@@ -21,7 +21,7 @@ nas:
train_batch: 2048 # batch size for freeze training. 2048 works reliably on V100 with cell13 onwards unfrozen
trainer:
plotsdir: ''
-val_top1_acc_threshold: 0.3 # after some accuracy we will shift into training only the last 'n' layers
+train_top1_acc_threshold: 0.4 # after some accuracy we will shift into training only the last 'n' layers
apex:
_copy: '/common/apex'
aux_weight: '_copy: /nas/eval/model_desc/aux_weight'